v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
regexp-parser.cc
Go to the documentation of this file.
1// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
6
11#include "src/regexp/regexp.h"
13#include "src/utils/ostreams.h"
14#include "src/utils/utils.h"
17
18#ifdef V8_INTL_SUPPORT
19#include "unicode/uniset.h"
20#include "unicode/unistr.h"
21#include "unicode/usetiter.h"
22#include "unicode/utf16.h" // For U16_NEXT
23#endif // V8_INTL_SUPPORT
24
25namespace v8 {
26namespace internal {
27
28namespace {
29
30// Whether we're currently inside the ClassEscape production
31// (tc39.es/ecma262/#prod-annexB-CharacterEscape).
// Passed to the escape-parsing helpers declared on RegExpParserImpl (e.g.
// ParseCharacterEscape, TryParseCharacterClassEscape, HasNamedCaptures).
32enum class InClassEscapeState {
33 kInClass,  // Presumably: the escape being parsed is inside a class.
34 kNotInClass,  // Presumably: the escape being parsed is outside a class.
35};
36
37// The production used to derive ClassSetOperand.
// Reported through the `type_out` parameter of ParseClassSetOperand and taken
// as `first_operand_type` by the ParseClassUnion / ParseClassIntersection /
// ParseClassSubtraction declarations on RegExpParserImpl.
38enum class ClassSetOperandType {
39 kClassSetCharacter,
40 kClassStringDisjunction,
41 kNestedClass,
42 kCharacterClassEscape, // \ CharacterClassEscape is a special nested class,
43 // as we can fold it directly into another range.
44 kClassSetRange
45};
46
// Builds the textual part of a regexp. Single code units are buffered in
// characters_, turned into atoms that are collected in text_
// (FlushCharacters), and finally moved into the caller-provided term list
// terms_ (FlushText).
// NOTE(review): the embedded listing numbers skip 66, 83 and 85, and members
// used by this class (flags_, pending_surrogate_) have no visible declaration
// here, so some lines appear to be missing from this copy — confirm against
// upstream.
47class RegExpTextBuilder {
48 public:
49 using SmallRegExpTreeVector = SmallZoneVector<RegExpTree*, 8>;
50
 // `terms_storage` is stored as a pointer (terms_) and appended to by
 // AddTerm() and FlushText(); it is not copied.
51 RegExpTextBuilder(Zone* zone, SmallRegExpTreeVector* terms_storage,
52 RegExpFlags flags)
53 : zone_(zone), flags_(flags), terms_(terms_storage), text_(zone) {}
 // Appends one code unit to the character buffer.
54 void AddCharacter(base::uc16 character);
 // Adds a code point; values above kMaxNonSurrogateCharCode are split into a
 // lead/trail surrogate pair, and in unicode mode lone surrogates take the
 // surrogate path instead of the character buffer.
55 void AddUnicodeCharacter(base::uc32 character);
 // Like AddUnicodeCharacter, but flushes the pending surrogate before and
 // after so the escaped value cannot pair with its neighbours.
56 void AddEscapedUnicodeCharacter(base::uc32 character);
 // Flushes buffered characters, then appends `atom` to text_.
57 void AddAtom(RegExpTree* atom);
 // Flushes all pending text, then appends `term` to terms_.
58 void AddTerm(RegExpTree* term);
 // Adds `cc` as a term if NeedsDesugaringForUnicode(cc), else as an atom.
59 void AddClassRanges(RegExpClassRanges* cc);
60 void FlushPendingSurrogate();
 // Moves everything collected in text_ into terms_ and clears text_.
61 void FlushText();
 // Removes and returns the most recently added atom (the last buffered code
 // unit if any, otherwise the last element of text_); nullptr if none.
62 RegExpTree* PopLastAtom();
 // Flushes pending text and returns the terms as one tree: RegExpEmpty for
 // no terms, the term itself for one, a RegExpAlternative otherwise.
63 RegExpTree* ToRegExp();
64
65 private:
67
68 void AddLeadSurrogate(base::uc16 lead_surrogate);
69 void AddTrailSurrogate(base::uc16 trail_surrogate);
 // Converts the character buffer (if non-null) into a RegExpAtom in text_.
70 void FlushCharacters();
71 bool NeedsDesugaringForUnicode(RegExpClassRanges* cc);
72 bool NeedsDesugaringForIgnoreCase(base::uc32 c);
 // Adds a standalone single-code-point class range for `c` as a term.
73 void AddClassRangesForDesugaring(base::uc32 c);
74 bool ignore_case() const { return IsIgnoreCase(flags_); }
75 bool IsUnicodeMode() const {
76 // Either /v or /u enable UnicodeMode
77 // https://tc39.es/ecma262/#sec-parsepattern
78 return IsUnicode(flags_) || IsUnicodeSets(flags_);
79 }
80 Zone* zone() const { return zone_; }
81
82 Zone* const zone_;
 // Buffer of plain code units not yet turned into an atom; nullptr when
 // nothing is buffered (allocated on first use in AddCharacter).
84 ZoneList<base::uc16>* characters_ = nullptr;
 // Destination term list supplied to the constructor.
86 SmallRegExpTreeVector* terms_;
 // Text elements collected since the last FlushText().
87 SmallRegExpTreeVector text_;
88};
89
// Stores `lead_surrogate` as the pending surrogate, after first flushing any
// surrogate that was already pending.
// NOTE(review): the embedded listing numbers jump from 90 to 92, so one line
// of this function appears to be missing from this copy — confirm against
// upstream.
90void RegExpTextBuilder::AddLeadSurrogate(base::uc16 lead_surrogate) {
92 FlushPendingSurrogate();
93 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
94 pending_surrogate_ = lead_surrogate;
95}
96
// Handles a trail surrogate. In the visible first branch it is combined with
// the pending lead surrogate into one code point, which is either desugared
// into a class range (when case-insensitive desugaring is needed) or added as
// a two-code-unit atom. In the `else` branch the trail surrogate is stored as
// pending and flushed immediately.
// NOTE(review): the embedded listing numbers skip 98-99 and 101-102, and the
// `} else {` below has no visible opening `if`, so the condition guarding the
// first branch is missing from this copy — confirm against upstream.
97void RegExpTextBuilder::AddTrailSurrogate(base::uc16 trail_surrogate) {
100 base::uc16 lead_surrogate = pending_surrogate_;
103 base::uc32 combined =
104 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
105 if (NeedsDesugaringForIgnoreCase(combined)) {
106 AddClassRangesForDesugaring(combined);
107 } else {
 // Keep the pair as a literal atom made of the two original code units.
108 ZoneList<base::uc16> surrogate_pair(2, zone());
109 surrogate_pair.Add(lead_surrogate, zone());
110 surrogate_pair.Add(trail_surrogate, zone());
111 RegExpAtom* atom =
112 zone()->New<RegExpAtom>(surrogate_pair.ToConstVector());
113 AddAtom(atom);
114 }
115 } else {
116 pending_surrogate_ = trail_surrogate;
117 FlushPendingSurrogate();
118 }
119}
120
// Emits the pending surrogate as a standalone single-code-point class range
// via AddClassRangesForDesugaring.
// NOTE(review): the embedded listing numbers skip 122 and 125, and there is
// one more closing brace than visible opening braces, so a guard around this
// body (and possibly a reset of pending_surrogate_) appears to be missing
// from this copy — confirm against upstream.
121void RegExpTextBuilder::FlushPendingSurrogate() {
123 DCHECK(IsUnicodeMode());
124 base::uc32 c = pending_surrogate_;
126 AddClassRangesForDesugaring(c);
127 }
128}
129
130void RegExpTextBuilder::FlushCharacters() {
131 FlushPendingSurrogate();
132 if (characters_ != nullptr) {
133 RegExpTree* atom = zone()->New<RegExpAtom>(characters_->ToConstVector());
134 characters_ = nullptr;
135 text_.emplace_back(atom);
136 }
137}
138
139void RegExpTextBuilder::FlushText() {
140 FlushCharacters();
141 size_t num_text = text_.size();
142 if (num_text == 0) {
143 return;
144 } else if (num_text == 1) {
145 terms_->emplace_back(text_.back());
146 } else {
147 RegExpText* text = zone()->New<RegExpText>(zone());
148 for (size_t i = 0; i < num_text; i++) {
149 text_[i]->AppendToText(text, zone());
150 }
151 terms_->emplace_back(text);
152 }
153 text_.clear();
154}
155
156void RegExpTextBuilder::AddCharacter(base::uc16 c) {
157 FlushPendingSurrogate();
158 if (characters_ == nullptr) {
159 characters_ = zone()->New<ZoneList<base::uc16>>(4, zone());
160 }
161 characters_->Add(c, zone());
162}
163
164void RegExpTextBuilder::AddUnicodeCharacter(base::uc32 c) {
165 if (c > static_cast<base::uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
166 DCHECK(IsUnicodeMode());
167 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
168 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
169 } else if (IsUnicodeMode() && unibrow::Utf16::IsLeadSurrogate(c)) {
170 AddLeadSurrogate(c);
171 } else if (IsUnicodeMode() && unibrow::Utf16::IsTrailSurrogate(c)) {
172 AddTrailSurrogate(c);
173 } else {
174 AddCharacter(static_cast<base::uc16>(c));
175 }
176}
177
178void RegExpTextBuilder::AddEscapedUnicodeCharacter(base::uc32 character) {
179 // A lead or trail surrogate parsed via escape sequence will not
180 // pair up with any preceding lead or following trail surrogate.
181 FlushPendingSurrogate();
182 AddUnicodeCharacter(character);
183 FlushPendingSurrogate();
184}
185
186void RegExpTextBuilder::AddClassRanges(RegExpClassRanges* cr) {
187 if (NeedsDesugaringForUnicode(cr)) {
188 // With /u or /v, character class needs to be desugared, so it
189 // must be a standalone term instead of being part of a RegExpText.
190 AddTerm(cr);
191 } else {
192 AddAtom(cr);
193 }
194}
195
196void RegExpTextBuilder::AddClassRangesForDesugaring(base::uc32 c) {
197 AddTerm(zone()->New<RegExpClassRanges>(
198 zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c))));
199}
200
201void RegExpTextBuilder::AddAtom(RegExpTree* atom) {
202 DCHECK(atom->IsTextElement());
203 FlushCharacters();
204 text_.emplace_back(atom);
205}
206
207void RegExpTextBuilder::AddTerm(RegExpTree* term) {
208 DCHECK(term->IsTextElement());
209 FlushText();
210 terms_->emplace_back(term);
211}
212
213bool RegExpTextBuilder::NeedsDesugaringForUnicode(RegExpClassRanges* cc) {
214 if (!IsUnicodeMode()) return false;
215 // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
216 // necessarily mean that we need to desugar. It's probably nicer to have a
217 // separate pass to figure out unicode desugarings.
218 if (ignore_case()) return true;
219 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
220 CharacterRange::Canonicalize(ranges);
221
222 if (cc->is_negated()) {
223 ZoneList<CharacterRange>* negated_ranges =
224 zone()->New<ZoneList<CharacterRange>>(ranges->length(), zone());
225 CharacterRange::Negate(ranges, negated_ranges, zone());
226 ranges = negated_ranges;
227 }
228
229 for (int i = ranges->length() - 1; i >= 0; i--) {
230 base::uc32 from = ranges->at(i).from();
231 base::uc32 to = ranges->at(i).to();
232 // Check for non-BMP characters.
233 if (to >= kNonBmpStart) return true;
234 // Check for lone surrogates.
235 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
236 }
237 return false;
238}
239
240// We only use this for characters made of surrogate pairs. All other
241// characters outside of character classes are made case independent in the
242// code generation.
243bool RegExpTextBuilder::NeedsDesugaringForIgnoreCase(base::uc32 c) {
244#ifdef V8_INTL_SUPPORT
245 if (IsUnicodeMode() && ignore_case()) {
246 icu::UnicodeSet set(c, c);
247 set.closeOver(USET_CASE_INSENSITIVE);
248 set.removeAllStrings();
249 return set.size() > 1;
250 }
251 // In the case where ICU is not included, we act as if the unicode flag is
252 // not set, and do not desugar.
253#endif // V8_INTL_SUPPORT
254 return false;
255}
256
257RegExpTree* RegExpTextBuilder::PopLastAtom() {
258 FlushPendingSurrogate();
259 RegExpTree* atom;
260 if (characters_ != nullptr) {
261 base::Vector<const base::uc16> char_vector = characters_->ToConstVector();
262 int num_chars = char_vector.length();
263 if (num_chars > 1) {
264 base::Vector<const base::uc16> prefix =
265 char_vector.SubVector(0, num_chars - 1);
266 text_.emplace_back(zone()->New<RegExpAtom>(prefix));
267 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
268 }
269 characters_ = nullptr;
270 atom = zone()->New<RegExpAtom>(char_vector);
271 return atom;
272 } else if (!text_.empty()) {
273 atom = text_.back();
274 text_.pop_back();
275 return atom;
276 }
277 return nullptr;
278}
279
280RegExpTree* RegExpTextBuilder::ToRegExp() {
281 FlushText();
282 size_t num_alternatives = terms_->size();
283 if (num_alternatives == 0) return zone()->New<RegExpEmpty>();
284 if (num_alternatives == 1) return terms_->back();
285 return zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>(
286 base::VectorOf(terms_->begin(), terms_->size()), zone()));
287}
288
289// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
// Owns the term list and hands a pointer to it to an embedded
// RegExpTextBuilder (see the constructor's initializer list).
// NOTE(review): the out-of-line member definitions are not part of this
// listing excerpt; the comments below describe only what the declarations
// show.
290class RegExpBuilder {
291 public:
 // The initializers rely on terms_ being declared before text_builder_,
 // because text_builder_ is constructed with the address of terms_.
292 RegExpBuilder(Zone* zone, RegExpFlags flags)
293 : zone_(zone),
294 flags_(flags),
295 terms_(zone),
296 alternatives_(zone),
297 text_builder_(RegExpTextBuilder{zone, &terms_, flags}) {}
298 void AddCharacter(base::uc16 character);
299 void AddUnicodeCharacter(base::uc32 character);
300 void AddEscapedUnicodeCharacter(base::uc32 character);
301 // "Adds" an empty expression. Does nothing except consume a
302 // following quantifier
303 void AddEmpty();
304 void AddClassRanges(RegExpClassRanges* cc);
305 void AddAtom(RegExpTree* tree);
306 void AddTerm(RegExpTree* tree);
307 void AddAssertion(RegExpTree* tree);
308 void NewAlternative(); // '|'
309 bool AddQuantifierToAtom(int min, int max, int index,
310 RegExpQuantifier::QuantifierType type);
311 void FlushText();
312 RegExpTree* ToRegExp();
 // Flags this builder was constructed with.
313 RegExpFlags flags() const { return flags_; }
314
 // Convenience accessors for individual bits of flags_.
315 bool ignore_case() const { return IsIgnoreCase(flags_); }
316 bool multiline() const { return IsMultiline(flags_); }
317 bool dotall() const { return IsDotAll(flags_); }
318
319 private:
320 void FlushTerms();
321 bool IsUnicodeMode() const {
322 // Either /v or /u enable UnicodeMode
323 // https://tc39.es/ecma262/#sec-parsepattern
324 return IsUnicode(flags_) || IsUnicodeSets(flags_);
325 }
326 Zone* zone() const { return zone_; }
327 RegExpTextBuilder& text_builder() { return text_builder_; }
328
329 Zone* const zone_;
330 bool pending_empty_ = false;
331 const RegExpFlags flags_;
332
333 using SmallRegExpTreeVector = SmallZoneVector<RegExpTree*, 8>;
 // Term list shared with text_builder_ (which holds a pointer to it).
334 SmallRegExpTreeVector terms_;
335 SmallRegExpTreeVector alternatives_;
336 RegExpTextBuilder text_builder_;
337};
338
// Kind of (sub)expression a RegExpParserState belongs to; stored as
// RegExpParserState::group_type_ and switched on in ParseDisjunction when a
// ')' closes a group.
339enum SubexpressionType {
 // Used for the outermost state, which has no previous state.
340 INITIAL,
341 CAPTURE, // All positive values represent captures.
 // The two lookaround kinds become a RegExpLookaround when the group closes.
342 POSITIVE_LOOKAROUND,
343 NEGATIVE_LOOKAROUND,
 // Becomes a RegExpGroup when the group closes.
344 GROUPING
345};
346
// One entry of the parser's stack of nested (sub)expressions. Each state owns
// the RegExpBuilder for its own disjunction and links to the enclosing state
// through previous_state_.
// NOTE(review): the embedded listing numbers skip 363, 384, 420, 435 and 442;
// the declarations of disjunction_capture_index_ and
// non_participating_capture_group_interval_, the body of
// non_participating_capture_group_interval(), and the left-hand sides of two
// assignments are missing from this copy — confirm against upstream.
347class RegExpParserState : public ZoneObject {
348 public:
349 // Push a state on the stack.
350 RegExpParserState(RegExpParserState* previous_state,
351 SubexpressionType group_type,
352 RegExpLookaround::Type lookaround_type,
353 int disjunction_capture_index,
354 const ZoneVector<base::uc16>* capture_name,
355 RegExpFlags flags, Zone* zone)
356 : previous_state_(previous_state),
357 builder_(zone, flags),
358 group_type_(group_type),
359 lookaround_type_(lookaround_type),
360 disjunction_capture_index_(disjunction_capture_index),
361 capture_name_(capture_name) {
362 if (previous_state != nullptr) {
 // NOTE(review): the target of this assignment is on the missing line 363;
 // presumably the interval is inherited from the enclosing state.
364 previous_state->non_participating_capture_group_interval();
365 }
366 }
367 // Parser state of containing expression, if any.
368 RegExpParserState* previous_state() const { return previous_state_; }
369 bool IsSubexpression() { return previous_state_ != nullptr; }
370 // RegExpBuilder building this regexp's AST.
371 RegExpBuilder* builder() { return &builder_; }
372 // Type of regexp being parsed (parenthesized group or entire regexp).
373 SubexpressionType group_type() const { return group_type_; }
374 // Lookahead or Lookbehind.
375 RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
376 // Index in captures array of first capture in this sub-expression, if any.
377 // Also the capture index of this sub-expression itself, if group_type
378 // is CAPTURE.
379 int capture_index() const { return disjunction_capture_index_; }
380 // The name of the current sub-expression, if group_type is CAPTURE. Only
381 // used for named captures.
382 const ZoneVector<base::uc16>* capture_name() const { return capture_name_; }
 // NOTE(review): the body of this accessor (line 384) is missing here.
383 std::pair<int, int> non_participating_capture_group_interval() const {
385 }
386
387 bool IsNamedCapture() const { return capture_name_ != nullptr; }
388
389 // Check whether the parser is inside a capture group with the given index.
390 bool IsInsideCaptureGroup(int index) const {
 // Walk outwards from this state through all enclosing states.
391 for (const RegExpParserState* s = this; s != nullptr;
392 s = s->previous_state()) {
393 if (s->group_type() != CAPTURE) continue;
394 // Return true if we found the matching capture index.
395 if (index == s->capture_index()) return true;
396 // Abort if index is larger than what has been parsed up till this state.
397 if (index > s->capture_index()) return false;
398 }
399 return false;
400 }
401
402 // Check whether the parser is inside a capture group with the given name.
403 bool IsInsideCaptureGroup(const ZoneVector<base::uc16>* name) const {
404 DCHECK_NOT_NULL(name);
405 for (const RegExpParserState* s = this; s != nullptr;
406 s = s->previous_state()) {
407 if (s->capture_name() == nullptr) continue;
 // Names are compared by content, not by pointer.
408 if (*s->capture_name() == *name) return true;
409 }
410 return false;
411 }
412
 // Called when a '|' starts a new alternative; `captures_started` is the
 // number of captures started so far (see the call in ParseDisjunction).
413 void NewAlternative(int captures_started) {
414 if (non_participating_capture_group_interval().second != 0) {
415 // Extend the non-participating interval.
416 non_participating_capture_group_interval_.second = captures_started;
417 } else {
418 // Create new non-participating interval from the start of the current
419 // enclosing group to all captures created within that group so far.
421 std::make_pair(capture_index(), captures_started);
422 }
423 }
424
425 private:
426 // Linked list implementation of stack of states.
427 RegExpParserState* const previous_state_;
428 // Builder for the stored disjunction.
429 RegExpBuilder builder_;
430 // Stored disjunction type (capture, look-ahead or grouping), if any.
431 const SubexpressionType group_type_;
432 // Stored read direction.
433 const RegExpLookaround::Type lookaround_type_;
434 // Stored disjunction's capture index (if any).
436 // Stored capture name (if any).
437 const ZoneVector<base::uc16>* const capture_name_;
438 // Interval of (named) capture indices ]from, to] that are not participating
439 // in the current state (i.e. they cannot match).
440 // Capture indices are not participating if they were created in a different
441 // alternative.
443};
444
// Regexp pattern parser over an input of `CharT` code units (explicit
// ReadNext specializations for uint8_t and base::uc16 follow the class).
// Every member is private; v8::internal::RegExpParser is declared a friend at
// the end of the class and is the way in.
// NOTE(review): the embedded listing numbers skip 542, 637, 644-645, 647,
// 649-652 and 654. Several members used by the inline accessors and the
// constructor (e.g. next_pos_, has_more_, simple_, contains_anchor_, failed_,
// captures_started_, named_captures_) have no visible declaration here, so
// lines are missing from this copy — confirm against upstream.
445template <class CharT>
446class RegExpParserImpl final {
447 private:
448 RegExpParserImpl(const CharT* input, int input_length, RegExpFlags flags,
449 uintptr_t stack_limit, Zone* zone,
450 const DisallowGarbageCollection& no_gc);
451
452 bool Parse(RegExpCompileData* result);
453
454 RegExpTree* ParsePattern();
455 RegExpTree* ParseDisjunction();
456 RegExpTree* ParseGroup();
457
458 // Parses a {...,...} quantifier and stores the range in the given
459 // out parameters.
460 bool ParseIntervalQuantifier(int* min_out, int* max_out);
461
462 // Checks whether the following is a length-digit hexadecimal number,
463 // and sets the value if it is.
464 bool ParseHexEscape(int length, base::uc32* value);
465 bool ParseUnicodeEscape(base::uc32* value);
466 bool ParseUnlimitedLengthHexNumber(int max_value, base::uc32* value);
467
468 bool ParsePropertyClassName(ZoneVector<char>* name_1,
469 ZoneVector<char>* name_2);
470 bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to_range,
471 CharacterClassStrings* add_to_strings, bool negate,
472 const ZoneVector<char>& name_1,
473 const ZoneVector<char>& name_2);
474
475 RegExpTree* ParseClassRanges(ZoneList<CharacterRange>* ranges,
476 bool add_unicode_case_equivalents);
477 // Parse inside a class. Either add escaped class to the range, or return
478 // false and pass parsed single character through |char_out|.
479 void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
480 bool add_unicode_case_equivalents, base::uc32* char_out,
481 bool* is_class_escape);
482 // Returns true iff parsing was successful.
483 bool TryParseCharacterClassEscape(base::uc32 next,
484 InClassEscapeState in_class_escape_state,
485 ZoneList<CharacterRange>* ranges,
486 CharacterClassStrings* strings, Zone* zone,
487 bool add_unicode_case_equivalents);
488 RegExpTree* ParseClassStringDisjunction(ZoneList<CharacterRange>* ranges,
489 CharacterClassStrings* strings);
490 RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder,
491 ClassSetOperandType* type_out);
492 RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder,
493 ClassSetOperandType* type_out,
494 ZoneList<CharacterRange>* ranges,
495 CharacterClassStrings* strings,
496 base::uc32* character);
497 base::uc32 ParseClassSetCharacter();
498 // Parses and returns a single escaped character.
499 base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state,
500 bool* is_escaped_unicode_character);
501
502 void AddMaybeSimpleCaseFoldedRange(ZoneList<CharacterRange>* ranges,
503 CharacterRange new_range);
504
505 RegExpTree* ParseClassUnion(const RegExpBuilder* builder, bool is_negated,
506 RegExpTree* first_operand,
507 ClassSetOperandType first_operand_type,
508 ZoneList<CharacterRange>* ranges,
509 CharacterClassStrings* strings,
510 base::uc32 first_character);
511 RegExpTree* ParseClassIntersection(const RegExpBuilder* builder,
512 bool is_negated, RegExpTree* first_operand,
513 ClassSetOperandType first_operand_type);
514 RegExpTree* ParseClassSubtraction(const RegExpBuilder* builder,
515 bool is_negated, RegExpTree* first_operand,
516 ClassSetOperandType first_operand_type);
517 RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
518
519 base::uc32 ParseOctalLiteral();
520
521 // Tries to parse the input as a back reference. If successful it
522 // stores the result in the output parameter and returns true. If
523 // it fails it will push back the characters read so the same characters
524 // can be reparsed.
525 bool ParseBackReferenceIndex(int* index_out);
526
 // Records the first error only and stops further reading; always returns
 // nullptr so callers can `return ReportError(...)`.
527 RegExpTree* ReportError(RegExpError error);
528 void Advance();
 // Advances by `dist` positions (adjusts next_pos_, then calls Advance()).
529 void Advance(int dist);
530 void RewindByOneCodepoint(); // Rewinds to before the previous Advance().
531 void Reset(int pos);
532
533 // Reports whether the pattern might be used as a literal search string.
534 // Only use if the result of the parse is a single atom node.
535 bool simple() const { return simple_; }
536 bool contains_anchor() const { return contains_anchor_; }
537 void set_contains_anchor() { contains_anchor_ = true; }
538 int captures_started() const { return captures_started_; }
 // Position of the current character: next_pos_ minus 2 when the current
 // character counts as a surrogate pair, minus 1 otherwise.
 // NOTE(review): the second half of the surrogate condition (line 542) is
 // missing from this copy.
539 int position() const {
540 const bool current_is_surrogate =
541 current() != kEndMarker &&
543 const int rewind_bytes = current_is_surrogate ? 2 : 1;
544 return next_pos_ - rewind_bytes;
545 }
546 bool failed() const { return failed_; }
547 RegExpFlags flags() const { return flags_; }
548 bool IsUnicodeMode() const {
549 // Either /v or /u enable UnicodeMode
550 // https://tc39.es/ecma262/#sec-parsepattern
 // force_unicode_ additionally turns it on while a ForceUnicodeScope is live.
551 return IsUnicode(flags()) || IsUnicodeSets(flags()) || force_unicode_;
552 }
553 bool unicode_sets() const { return IsUnicodeSets(flags()); }
554 bool ignore_case() const { return IsIgnoreCase(flags()); }
555
556 static bool IsSyntaxCharacterOrSlash(base::uc32 c);
557 static bool IsClassSetSyntaxCharacter(base::uc32 c);
558 static bool IsClassSetReservedPunctuator(base::uc32 c);
 // Not static: the check looks at the next input character via Next().
559 bool IsClassSetReservedDoublePunctuator(base::uc32 c);
560
 // Sentinel returned by Next() when no input is left; 1 << 21 lies above
 // every Unicode code point (max 0x10FFFF).
561 static const base::uc32 kEndMarker = (1 << 21);
562
563 private:
564 // Return the 1-indexed RegExpCapture object, allocate if necessary.
565 RegExpCapture* GetCapture(int index);
566
567 // Creates a new named capture at the specified index. Must be called exactly
568 // once for each named capture. Fails if a capture with the same name is
569 // encountered.
570 bool CreateNamedCaptureAtIndex(const RegExpParserState* state, int index);
571
572 // Parses the name of a capture group (?<name>pattern). The name must adhere
573 // to IdentifierName in the ECMAScript standard.
574 const ZoneVector<base::uc16>* ParseCaptureGroupName();
575
576 bool ParseNamedBackReference(RegExpBuilder* builder,
577 RegExpParserState* state);
578 RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
579
580 // After the initial parsing pass, patch corresponding RegExpCapture objects
581 // into all RegExpBackReferences. This is done after initial parsing in order
582 // to avoid complicating cases in which references comes before the capture.
583 void PatchNamedBackReferences();
584
585 ZoneVector<RegExpCapture*>* GetNamedCaptures();
586
587 // Returns true iff the pattern contains named captures. May call
588 // ScanForCaptures to look ahead at the remaining pattern.
589 bool HasNamedCaptures(InClassEscapeState in_class_escape_state);
590
591 Zone* zone() const { return zone_; }
592
 // The character most recently loaded by Advance().
593 base::uc32 current() const { return current_; }
594 bool has_more() const { return has_more_; }
595 bool has_next() const { return next_pos_ < input_length(); }
 // Peeks at the next character without consuming it; kEndMarker at the end.
596 base::uc32 Next();
 // Reads the character at next_pos_; next_pos_ is moved past it only when
 // `update_position` is true.
597 template <bool update_position>
598 base::uc32 ReadNext();
599 CharT InputAt(int index) const {
600 DCHECK(0 <= index && index < input_length());
601 return input_[index];
602 }
603 int input_length() const { return input_length_; }
604 void ScanForCaptures(InClassEscapeState in_class_escape_state);
605
 // Orders RegExpCapture pointers by the content of their names; used as the
 // comparator of the capture-name map declared below.
606 struct RegExpCaptureNameLess {
607 bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
608 DCHECK_NOT_NULL(lhs);
609 DCHECK_NOT_NULL(rhs);
610 return *lhs->name() < *rhs->name();
611 }
612 };
613
 // Sets parser->force_unicode_ for the lifetime of the scope object and
 // clears it on destruction. The DCHECKs reject nesting.
614 class ForceUnicodeScope final {
615 public:
616 explicit ForceUnicodeScope(RegExpParserImpl<CharT>* parser)
617 : parser_(parser) {
618 DCHECK(!parser_->force_unicode_);
619 parser_->force_unicode_ = true;
620 }
621 ~ForceUnicodeScope() {
622 DCHECK(parser_->force_unicode_);
623 parser_->force_unicode_ = false;
624 }
625
626 private:
627 RegExpParserImpl<CharT>* const parser_;
628 };
629
630 const DisallowGarbageCollection no_gc_;
631 Zone* const zone_;
632 RegExpError error_ = RegExpError::kNone;
633 int error_pos_ = 0;
634 ZoneList<RegExpCapture*>* captures_;
635 // Maps capture names to a list of capture indices with this name.
 // NOTE(review): the member name for this declaration (line 637) is missing
 // from this copy; the constructor initializes a member named named_captures_.
636 ZoneMap<RegExpCapture*, ZoneList<int>*, RegExpCaptureNameLess>*
638 ZoneList<RegExpBackReference*>* named_back_references_;
639 const CharT* const input_;
640 const int input_length_;
641 base::uc32 current_;
642 RegExpFlags flags_;
643 bool force_unicode_ = false; // Force parser to act as if unicode were set.
646 int capture_count_; // Only valid after we have scanned for captures.
648 int lookaround_count_; // Only valid after we have scanned for lookbehinds.
653 bool has_named_captures_; // Only valid after we have scanned for captures.
655 const uintptr_t stack_limit_;
656
657 friend class v8::internal::RegExpParser;
658};
659
// Stores the input, flags, zone and stack limit, puts the parser into its
// initial state, and calls Advance() once before returning.
// NOTE(review): the embedded listing numbers skip 670, 673-676 and 680, so
// some member initializers are missing from this copy. The visible list also
// has no entry for no_gc_ even though `no_gc` is a parameter — confirm
// against upstream.
660template <class CharT>
661RegExpParserImpl<CharT>::RegExpParserImpl(
662 const CharT* input, int input_length, RegExpFlags flags,
663 uintptr_t stack_limit, Zone* zone, const DisallowGarbageCollection& no_gc)
664 : zone_(zone),
665 captures_(nullptr),
666 named_captures_(nullptr),
667 named_back_references_(nullptr),
668 input_(input),
669 input_length_(input_length),
671 flags_(flags),
672 next_pos_(0),
677 has_more_(true),
678 simple_(false),
679 contains_anchor_(false),
681 has_named_captures_(false),
682 failed_(false),
683 stack_limit_(stack_limit) {
684 Advance();
685}
686
// One-byte input specialization: returns the code unit at next_pos_.
// next_pos_ is moved past it only when `update_position` is true.
// NOTE(review): the embedded listing numbers jump from 692 to 694, so one
// line appears to be missing from this copy — confirm against upstream.
687template <>
688template <bool update_position>
689inline base::uc32 RegExpParserImpl<uint8_t>::ReadNext() {
690 int position = next_pos_;
691 base::uc16 c0 = InputAt(position);
692 position++;
694 if (update_position) next_pos_ = position;
695 return c0;
696}
697
// Two-byte input specialization: returns the character at next_pos_. In
// unicode mode, when another code unit follows, the visible code reads a
// second unit `c1` and steps over it as well. next_pos_ is updated only when
// `update_position` is true.
// NOTE(review): the embedded listing numbers skip 707 and 709-710. The rest
// of the condition, the opening of the inner block and the statement that
// combines c0 and c1 into `result` are missing from this copy (the braces
// below are unbalanced as shown) — confirm against upstream.
698template <>
699template <bool update_position>
700inline base::uc32 RegExpParserImpl<base::uc16>::ReadNext() {
701 int position = next_pos_;
702 base::uc16 c0 = InputAt(position);
703 base::uc32 result = c0;
704 position++;
705 // Read the whole surrogate pair in case of unicode mode, if possible.
706 if (IsUnicodeMode() && position < input_length() &&
708 base::uc16 c1 = InputAt(position);
711 position++;
712 }
713 }
714 if (update_position) next_pos_ = position;
715 return result;
716}
717
718template <class CharT>
719base::uc32 RegExpParserImpl<CharT>::Next() {
720 if (has_next()) {
721 return ReadNext<false>();
722 } else {
723 return kEndMarker;
724 }
725}
726
// Loads the next character into current_. While input remains, the visible
// code either reports a stack overflow (aborting via FATAL when
// v8_flags.correctness_fuzzer_suppressions is set) or reads the next
// character and moves next_pos_ past it. At the end of input, next_pos_ is
// set to input_length() + 1 and has_more_ is cleared.
// NOTE(review): the embedded listing numbers skip 730 and 739. The condition
// that selects the stack-overflow branch and one statement of the
// end-of-input branch are missing from this copy — confirm against upstream.
727template <class CharT>
728void RegExpParserImpl<CharT>::Advance() {
729 if (has_next()) {
731 if (v8_flags.correctness_fuzzer_suppressions) {
732 FATAL("Aborting on stack overflow");
733 }
734 ReportError(RegExpError::kStackOverflow);
735 } else {
736 current_ = ReadNext<true>();
737 }
738 } else {
740 // Advance so that position() points to 1-after-the-last-character. This is
741 // important so that Reset() to this position works correctly.
742 next_pos_ = input_length() + 1;
743 has_more_ = false;
744 }
745}
746
// Undoes the previous Advance(). Does nothing once the end of input has been
// reached (has_more() is false).
// NOTE(review): the embedded listing numbers jump from 753 to 755, so the
// initializer of `rewind_by` is missing from this copy. Because
// Advance(rewind_by) is used to move backwards, the value is presumably
// negative — confirm against upstream.
747template <class CharT>
748void RegExpParserImpl<CharT>::RewindByOneCodepoint() {
749 if (!has_more()) return;
750 // Rewinds by one code point, i.e.: two code units if `current` is outside
751 // the basic multilingual plane (= composed of a lead and trail surrogate),
752 // or one code unit otherwise.
753 const int rewind_by =
755 Advance(rewind_by); // Undo the last Advance.
756}
757
758template <class CharT>
759void RegExpParserImpl<CharT>::Reset(int pos) {
760 next_pos_ = pos;
761 has_more_ = (pos < input_length());
762 Advance();
763}
764
765template <class CharT>
766void RegExpParserImpl<CharT>::Advance(int dist) {
767 next_pos_ += dist - 1;
768 Advance();
769}
770
771// static
772template <class CharT>
773bool RegExpParserImpl<CharT>::IsSyntaxCharacterOrSlash(base::uc32 c) {
774 switch (c) {
775 case '^':
776 case '$':
777 case '\\':
778 case '.':
779 case '*':
780 case '+':
781 case '?':
782 case '(':
783 case ')':
784 case '[':
785 case ']':
786 case '{':
787 case '}':
788 case '|':
789 case '/':
790 return true;
791 default:
792 break;
793 }
794 return false;
795}
796
797// static
798template <class CharT>
799bool RegExpParserImpl<CharT>::IsClassSetSyntaxCharacter(base::uc32 c) {
800 switch (c) {
801 case '(':
802 case ')':
803 case '[':
804 case ']':
805 case '{':
806 case '}':
807 case '/':
808 case '-':
809 case '\\':
810 case '|':
811 return true;
812 default:
813 break;
814 }
815 return false;
816}
817
818// static
819template <class CharT>
820bool RegExpParserImpl<CharT>::IsClassSetReservedPunctuator(base::uc32 c) {
821 switch (c) {
822 case '&':
823 case '-':
824 case '!':
825 case '#':
826 case '%':
827 case ',':
828 case ':':
829 case ';':
830 case '<':
831 case '=':
832 case '>':
833 case '@':
834 case '`':
835 case '~':
836 return true;
837 default:
838 break;
839 }
840 return false;
841}
842
// Checks whether `c` starts a doubled punctuator. Each case generated by
// DOUBLE_PUNCTUATOR_CASE(Char) returns true iff the next input character (as
// peeked by Next()) equals Char; any character without a case yields false.
// NOTE(review): the embedded listing numbers skip 850-868, so the list of
// DOUBLE_PUNCTUATOR_CASE(...) entries inside the switch is missing from this
// copy — confirm against upstream.
843template <class CharT>
844bool RegExpParserImpl<CharT>::IsClassSetReservedDoublePunctuator(base::uc32 c) {
845#define DOUBLE_PUNCTUATOR_CASE(Char) \
846 case Char: \
847 return Next() == Char
848
849 switch (c) {
869 default:
870 break;
871 }
872#undef DOUBLE_PUNCTUATOR_CASE
873
874 return false;
875}
876
// Records `error` and puts the parser into the failed state. Only the first
// error is kept. The read position is moved to the end of the input and
// has_more_ is cleared so that no further input is consumed. Always returns
// nullptr, which lets callers write `return ReportError(...)`.
// NOTE(review): the embedded listing numbers skip 882 and 884, so two
// statements are missing from this copy (error_pos_ is not assigned in the
// visible code) — confirm against upstream.
877template <class CharT>
878RegExpTree* RegExpParserImpl<CharT>::ReportError(RegExpError error) {
879 if (failed_) return nullptr; // Do not overwrite any existing error.
880 failed_ = true;
881 error_ = error;
883 // Zip to the end to make sure no more input is read.
885 next_pos_ = input_length();
886 has_more_ = false;
887 return nullptr;
888}
889
// Deliberately unbalanced macro, written as the last token inside a call's
// argument list, e.g. `Foo(CHECK_FAILED)` or `Foo(arg CHECK_FAILED)`. It
// closes that call and ends the statement, returns nullptr from the enclosing
// function if failed_ is set, and then opens `((void)0` so that the call
// site's own `)` and `;` complete a no-op expression.
890#define CHECK_FAILED ); \
891 if (failed_) return nullptr; \
892 ((void)0
893
894// Pattern ::
895// Disjunction
896template <class CharT>
897RegExpTree* RegExpParserImpl<CharT>::ParsePattern() {
898 RegExpTree* result = ParseDisjunction(CHECK_FAILED);
899 PatchNamedBackReferences(CHECK_FAILED);
900 DCHECK(!has_more());
901 // If the result of parsing is a literal string atom, and it has the
902 // same length as the input, then the atom is identical to the input.
903 if (result->IsAtom() && result->AsAtom()->length() == input_length()) {
904 simple_ = true;
905 }
906 return result;
907}
908
909// Disjunction ::
910// Alternative
911// Alternative | Disjunction
912// Alternative ::
913// [empty]
914// Term Alternative
915// Term ::
916// Assertion
917// Atom
918// Atom Quantifier
919template <class CharT>
920RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
921 // Used to store current state while parsing subexpressions.
922 RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
923 0, nullptr, flags(), zone());
924 RegExpParserState* state = &initial_state;
925 // Cache the builder in a local variable for quick access.
926 RegExpBuilder* builder = initial_state.builder();
927 while (true) {
928 switch (current()) {
929 case kEndMarker:
930 if (failed()) return nullptr; // E.g. the initial Advance failed.
931 if (state->IsSubexpression()) {
932 // Inside a parenthesized group when hitting end of input.
933 return ReportError(RegExpError::kUnterminatedGroup);
934 }
935 DCHECK_EQ(INITIAL, state->group_type());
936 // Parsing completed successfully.
937 return builder->ToRegExp();
938 case ')': {
939 if (!state->IsSubexpression()) {
940 return ReportError(RegExpError::kUnmatchedParen);
941 }
942 DCHECK_NE(INITIAL, state->group_type());
943
944 Advance();
945 // End disjunction parsing and convert builder content to new single
946 // regexp atom.
947 RegExpTree* body = builder->ToRegExp();
948
949 int end_capture_index = captures_started();
950
951 int capture_index = state->capture_index();
952 SubexpressionType group_type = state->group_type();
953
954 // Build result of subexpression.
955 if (group_type == CAPTURE) {
956 if (state->IsNamedCapture()) {
957 CreateNamedCaptureAtIndex(state, capture_index CHECK_FAILED);
958 }
959 RegExpCapture* capture = GetCapture(capture_index);
960 capture->set_body(body);
961 body = capture;
962 } else if (group_type == GROUPING) {
963 body = zone()->template New<RegExpGroup>(body, builder->flags());
964 } else {
965 DCHECK(group_type == POSITIVE_LOOKAROUND ||
966 group_type == NEGATIVE_LOOKAROUND);
967 bool is_positive = (group_type == POSITIVE_LOOKAROUND);
968 body = zone()->template New<RegExpLookaround>(
969 body, is_positive, end_capture_index - capture_index,
970 capture_index, state->lookaround_type(), lookaround_count_);
972 }
973
974 // Restore previous state.
975 state = state->previous_state();
976 builder = state->builder();
977
978 builder->AddAtom(body);
979 // For compatibility with JSC and ES3, we allow quantifiers after
980 // lookaheads, and break in all cases.
981 break;
982 }
983 case '|': {
984 Advance();
985 state->NewAlternative(captures_started());
986 builder->NewAlternative();
987 continue;
988 }
989 case '*':
990 case '+':
991 case '?':
992 return ReportError(RegExpError::kNothingToRepeat);
993 case '^': {
994 Advance();
995 builder->AddAssertion(zone()->template New<RegExpAssertion>(
996 builder->multiline() ? RegExpAssertion::Type::START_OF_LINE
998 set_contains_anchor();
999 continue;
1000 }
1001 case '$': {
1002 Advance();
1003 RegExpAssertion::Type assertion_type =
1004 builder->multiline() ? RegExpAssertion::Type::END_OF_LINE
1006 builder->AddAssertion(
1007 zone()->template New<RegExpAssertion>(assertion_type));
1008 continue;
1009 }
1010 case '.': {
1011 Advance();
1012 ZoneList<CharacterRange>* ranges =
1013 zone()->template New<ZoneList<CharacterRange>>(2, zone());
1014
1015 if (builder->dotall()) {
1016 // Everything.
1018 ranges, false, zone());
1019 } else {
1020 // Everything except \x0A, \x0D, \u2028 and \u2029.
1022 StandardCharacterSet::kNotLineTerminator, ranges, false, zone());
1023 }
1024
1025 RegExpClassRanges* cc =
1026 zone()->template New<RegExpClassRanges>(zone(), ranges);
1027 builder->AddClassRanges(cc);
1028 break;
1029 }
1030 case '(': {
1031 state = ParseOpenParenthesis(state CHECK_FAILED);
1032 builder = state->builder();
1033 flags_ = builder->flags();
1034 continue;
1035 }
1036 case '[': {
1037 RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
1038 if (cc->IsClassRanges()) {
1039 builder->AddClassRanges(cc->AsClassRanges());
1040 } else {
1041 DCHECK(cc->IsClassSetExpression());
1042 builder->AddTerm(cc);
1043 }
1044 break;
1045 }
1046 // Atom ::
1047 // \ AtomEscape
1048 case '\\':
1049 switch (Next()) {
1050 case kEndMarker:
1051 return ReportError(RegExpError::kEscapeAtEndOfPattern);
1052 // AtomEscape ::
1053 // [+UnicodeMode] DecimalEscape
1054 // [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber
1055 // of DecimalEscape is ≤ NcapturingParens
1056 // CharacterEscape (some cases of this mixed in too)
1057 //
1058 // TODO(jgruber): It may make sense to disentangle all the different
1059 // cases and make the structure mirror the spec, e.g. for AtomEscape:
1060 //
1061 // if (TryParseDecimalEscape(...)) return;
1062 // if (TryParseCharacterClassEscape(...)) return;
1063 // if (TryParseCharacterEscape(...)) return;
1064 // if (TryParseGroupName(...)) return;
1065 case '1':
1066 case '2':
1067 case '3':
1068 case '4':
1069 case '5':
1070 case '6':
1071 case '7':
1072 case '8':
1073 case '9': {
1074 int index = 0;
1075 const bool is_backref =
1076 ParseBackReferenceIndex(&index CHECK_FAILED);
1077 if (is_backref) {
1078 if (state->IsInsideCaptureGroup(index)) {
1079 // The back reference is inside the capture group it refers to.
1080 // Nothing can possibly have been captured yet, so we use empty
1081 // instead. This ensures that, when checking a back reference,
1082 // the capture registers of the referenced capture are either
1083 // both set or both cleared.
1084 builder->AddEmpty();
1085 } else {
1086 RegExpCapture* capture = GetCapture(index);
1087 RegExpTree* atom =
1088 zone()->template New<RegExpBackReference>(capture, zone());
1089 builder->AddAtom(atom);
1090 }
1091 break;
1092 }
1093 // With /u and /v, no identity escapes except for syntax characters
1094 // are allowed. Otherwise, all identity escapes are allowed.
1095 if (IsUnicodeMode()) {
1096 return ReportError(RegExpError::kInvalidEscape);
1097 }
1098 base::uc32 first_digit = Next();
1099 if (first_digit == '8' || first_digit == '9') {
1100 builder->AddCharacter(first_digit);
1101 Advance(2);
1102 break;
1103 }
1104 [[fallthrough]];
1105 }
1106 case '0': {
1107 Advance();
1108 if (IsUnicodeMode() && Next() >= '0' && Next() <= '9') {
1109 // Decimal escape with leading 0 are not parsed as octal.
1110 return ReportError(RegExpError::kInvalidDecimalEscape);
1111 }
1112 base::uc32 octal = ParseOctalLiteral();
1113 builder->AddCharacter(octal);
1114 break;
1115 }
1116 case 'b':
1117 Advance(2);
1118 builder->AddAssertion(zone()->template New<RegExpAssertion>(
1120 continue;
1121 case 'B':
1122 Advance(2);
1123 builder->AddAssertion(zone()->template New<RegExpAssertion>(
1125 continue;
1126 // AtomEscape ::
1127 // CharacterClassEscape
1128 case 'd':
1129 case 'D':
1130 case 's':
1131 case 'S':
1132 case 'w':
1133 case 'W': {
1134 base::uc32 next = Next();
1135 ZoneList<CharacterRange>* ranges =
1136 zone()->template New<ZoneList<CharacterRange>>(2, zone());
1137 bool add_unicode_case_equivalents =
1138 IsUnicodeMode() && ignore_case();
1139 bool parsed_character_class_escape = TryParseCharacterClassEscape(
1140 next, InClassEscapeState::kNotInClass, ranges, nullptr, zone(),
1141 add_unicode_case_equivalents CHECK_FAILED);
1142
1143 if (parsed_character_class_escape) {
1144 RegExpClassRanges* cc =
1145 zone()->template New<RegExpClassRanges>(zone(), ranges);
1146 builder->AddClassRanges(cc);
1147 } else {
1148 CHECK(!IsUnicodeMode());
1149 Advance(2);
1150 builder->AddCharacter(next); // IdentityEscape.
1151 }
1152 break;
1153 }
1154 case 'p':
1155 case 'P': {
1156 base::uc32 next = Next();
1157 ZoneList<CharacterRange>* ranges =
1158 zone()->template New<ZoneList<CharacterRange>>(2, zone());
1159 CharacterClassStrings* strings = nullptr;
1160 if (unicode_sets()) {
1161 strings = zone()->template New<CharacterClassStrings>(zone());
1162 }
1163 bool add_unicode_case_equivalents = ignore_case();
1164 bool parsed_character_class_escape = TryParseCharacterClassEscape(
1165 next, InClassEscapeState::kNotInClass, ranges, strings, zone(),
1166 add_unicode_case_equivalents CHECK_FAILED);
1167
1168 if (parsed_character_class_escape) {
1169 if (unicode_sets()) {
1170 RegExpClassSetOperand* op =
1171 zone()->template New<RegExpClassSetOperand>(ranges,
1172 strings);
1173 builder->AddTerm(op);
1174 } else {
1175 RegExpClassRanges* cc =
1176 zone()->template New<RegExpClassRanges>(zone(), ranges);
1177 builder->AddClassRanges(cc);
1178 }
1179 } else {
1180 CHECK(!IsUnicodeMode());
1181 Advance(2);
1182 builder->AddCharacter(next); // IdentityEscape.
1183 }
1184 break;
1185 }
1186 // AtomEscape ::
1187 // k GroupName
1188 case 'k': {
1189 // Either an identity escape or a named back-reference. The two
1190 // interpretations are mutually exclusive: '\k' is interpreted as
1191 // an identity escape for non-Unicode patterns without named
1192 // capture groups, and as the beginning of a named back-reference
1193 // in all other cases.
1194 const bool has_named_captures =
1195 HasNamedCaptures(InClassEscapeState::kNotInClass CHECK_FAILED);
1196 if (IsUnicodeMode() || has_named_captures) {
1197 Advance(2);
1198 ParseNamedBackReference(builder, state CHECK_FAILED);
1199 break;
1200 }
1201 }
1202 [[fallthrough]];
1203 // AtomEscape ::
1204 // CharacterEscape
1205 default: {
1206 bool is_escaped_unicode_character = false;
1207 base::uc32 c = ParseCharacterEscape(
1208 InClassEscapeState::kNotInClass,
1209 &is_escaped_unicode_character CHECK_FAILED);
1210 if (is_escaped_unicode_character) {
1211 builder->AddEscapedUnicodeCharacter(c);
1212 } else {
1213 builder->AddCharacter(c);
1214 }
1215 break;
1216 }
1217 }
1218 break;
1219 case '{': {
1220 int dummy;
1221 bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
1222 if (parsed) return ReportError(RegExpError::kNothingToRepeat);
1223 [[fallthrough]];
1224 }
1225 case '}':
1226 case ']':
1227 if (IsUnicodeMode()) {
1228 return ReportError(RegExpError::kLoneQuantifierBrackets);
1229 }
1230 [[fallthrough]];
1231 default:
1232 builder->AddUnicodeCharacter(current());
1233 Advance();
1234 break;
1235 } // end switch(current())
1236
1237 int min;
1238 int max;
1239 switch (current()) {
1240 // QuantifierPrefix ::
1241 // *
1242 // +
1243 // ?
1244 // {
1245 case '*':
1246 min = 0;
1248 Advance();
1249 break;
1250 case '+':
1251 min = 1;
1253 Advance();
1254 break;
1255 case '?':
1256 min = 0;
1257 max = 1;
1258 Advance();
1259 break;
1260 case '{':
1261 if (ParseIntervalQuantifier(&min, &max)) {
1262 if (max < min) {
1263 return ReportError(RegExpError::kRangeOutOfOrder);
1264 }
1265 break;
1266 } else if (IsUnicodeMode()) {
1267 // Incomplete quantifiers are not allowed.
1268 return ReportError(RegExpError::kIncompleteQuantifier);
1269 }
1270 continue;
1271 default:
1272 continue;
1273 }
1275 if (current() == '?') {
1276 quantifier_type = RegExpQuantifier::NON_GREEDY;
1277 Advance();
1278 } else if (v8_flags.regexp_possessive_quantifier && current() == '+') {
1279 // v8_flags.regexp_possessive_quantifier is a debug-only flag.
1280 quantifier_type = RegExpQuantifier::POSSESSIVE;
1281 Advance();
1282 }
1283 if (!builder->AddQuantifierToAtom(min, max, quantifier_count_,
1284 quantifier_type)) {
1285 return ReportError(RegExpError::kInvalidQuantifier);
1286 }
1288 }
1289}
1290
// Parses the opening of a parenthesized group. On entry current() is the '('.
// The function consumes it together with any '?'-introduced specifier
// (modifier flags, ':', '=', '!', '<=', '<!' or the '<name>' of a named
// capture) and returns a newly allocated RegExpParserState created from
// `state` for parsing the group's contents. On a syntax error the error is
// reported via ReportError and nullptr is returned.
1291template <class CharT>
1292RegExpParserState* RegExpParserImpl<CharT>::ParseOpenParenthesis(
1293 RegExpParserState* state) {
// Defaults describe a plain capturing group that inherits the enclosing
// lookaround type and the flags of the enclosing builder.
1294 RegExpLookaround::Type lookaround_type = state->lookaround_type();
1295 bool is_named_capture = false;
1296 const ZoneVector<base::uc16>* capture_name = nullptr;
1297 SubexpressionType subexpr_type = CAPTURE;
1298 RegExpFlags flags = state->builder()->flags();
// parsing_modifiers: true while inside a modifier list such as '(?i-m'.
// modifiers_polarity: true before a '-' (flags are switched on), false after
// it (flags are switched off).
// modifiers: the set of flags already mentioned in this modifier list.
1299 bool parsing_modifiers = false;
1300 bool modifiers_polarity = true;
1301 RegExpFlags modifiers;
1302 Advance();
1303 if (current() == '?') {
1304 do {
// Modifier characters advance by one, so Next() always inspects the
// character following the last one consumed.
1305 base::uc32 next = Next();
1306 switch (next) {
1307 case '-':
1308 if (!v8_flags.js_regexp_modifiers) {
1309 ReportError(RegExpError::kInvalidGroup);
1310 return nullptr;
1311 }
1312 Advance();
1313 parsing_modifiers = true;
// Only a single dash is allowed per modifier list.
1314 if (modifiers_polarity == false) {
1315 ReportError(RegExpError::kMultipleFlagDashes);
1316 return nullptr;
1317 }
1318 modifiers_polarity = false;
1319 break;
1320 case 'm':
1321 case 'i':
1322 case 's': {
1323 if (!v8_flags.js_regexp_modifiers) {
1324 ReportError(RegExpError::kInvalidGroup);
1325 return nullptr;
1326 }
1327 Advance();
1328 parsing_modifiers = true;
1329 RegExpFlag flag = TryRegExpFlagFromChar(next).value();
// Each flag may be mentioned at most once, on either side of the dash.
1330 if ((modifiers & flag) != 0) {
1331 ReportError(RegExpError::kRepeatedFlag);
1332 return nullptr;
1333 }
1334 modifiers |= flag;
1335 flags.set(flag, modifiers_polarity);
1336 break;
1337 }
1338 case ':':
// Ends the (possibly empty) modifier list: a non-capturing group.
1339 Advance(2);
1340 parsing_modifiers = false;
1341 subexpr_type = GROUPING;
1342 break;
1343 case '=':
1344 Advance(2);
// A lookaround specifier is rejected after modifier characters.
1345 if (parsing_modifiers) {
1346 DCHECK(v8_flags.js_regexp_modifiers);
1347 ReportError(RegExpError::kInvalidGroup);
1348 return nullptr;
1349 }
1350 lookaround_type = RegExpLookaround::LOOKAHEAD;
1351 subexpr_type = POSITIVE_LOOKAROUND;
1352 break;
1353 case '!':
1354 Advance(2);
1355 if (parsing_modifiers) {
1356 DCHECK(v8_flags.js_regexp_modifiers);
1357 ReportError(RegExpError::kInvalidGroup);
1358 return nullptr;
1359 }
1360 lookaround_type = RegExpLookaround::LOOKAHEAD;
1361 subexpr_type = NEGATIVE_LOOKAROUND;
1362 break;
1363 case '<':
// Step onto the '<' so that Next() is the character following it.
1364 Advance();
1365 if (parsing_modifiers) {
1366 DCHECK(v8_flags.js_regexp_modifiers);
1367 ReportError(RegExpError::kInvalidGroup);
1368 return nullptr;
1369 }
1370 if (Next() == '=') {
1371 Advance(2);
1372 lookaround_type = RegExpLookaround::LOOKBEHIND;
1373 subexpr_type = POSITIVE_LOOKAROUND;
1374 break;
1375 } else if (Next() == '!') {
1376 Advance(2);
1377 lookaround_type = RegExpLookaround::LOOKBEHIND;
1378 subexpr_type = NEGATIVE_LOOKAROUND;
1379 break;
1380 }
// Neither '<=' nor '<!': treat it as the start of a named capture. The name
// itself is parsed further below, once the group is known to be a capture.
1381 is_named_capture = true;
1382 has_named_captures_ = true;
1383 Advance();
1384 break;
1385 default:
1386 ReportError(RegExpError::kInvalidGroup);
1387 return nullptr;
1388 }
// Keep reading specifier characters only while inside a modifier list.
1389 } while (parsing_modifiers);
1390 }
1391 if (modifiers_polarity == false) {
1392 // We encountered a dash.
// A dash without any flag anywhere in the list is rejected.
1393 if (modifiers == 0) {
1394 ReportError(RegExpError::kInvalidFlagGroup);
1395 return nullptr;
1396 }
1397 }
1398 if (subexpr_type == CAPTURE) {
// NOTE(review): the embedded listing numbers jump from 1398 to 1400 and from
// 1402 to 1404, and the braces below do not balance. Two lines appear to be
// missing from this copy -- presumably the condition guarding the
// kTooManyCaptures error and a statement following it. Restore them from the
// upstream file; do not rely on this text as-is.
1400 ReportError(RegExpError::kTooManyCaptures);
1401 return nullptr;
1402 }
1404
1405 if (is_named_capture) {
1406 capture_name = ParseCaptureGroupName(CHECK_FAILED);
1407 }
1408 }
1409 // Store current state and begin new disjunction parsing.
1410 return zone()->template New<RegExpParserState>(
1411 state, subexpr_type, lookaround_type, captures_started_, capture_name,
1412 flags, zone());
1413}
1414
1415// In order to know whether an escape is a backreference or not we have to scan
1416// the entire regexp and find the number of capturing parentheses. However we
1417// don't want to scan the regexp twice unless it is necessary. This mini-parser
1418// is called when needed. It can see the difference between capturing and
1419// noncapturing parentheses and can skip character classes and backslash-escaped
1420// characters.
1421//
1422// Important: The scanner has to be in a consistent state when calling
1423// ScanForCaptures, e.g. not in the middle of an escape sequence '\[' or while
1424// parsing a nested class.
1425template <class CharT>
1426void RegExpParserImpl<CharT>::ScanForCaptures(
1427 InClassEscapeState in_class_escape_state) {
// NOTE(review): the embedded listing numbers jump from 1427 to 1429 here; a
// statement may be missing from this copy -- verify against the upstream file.
//
// Remember where we are: the scan below only looks ahead and the position is
// restored by the Reset() call at the end.
1429 const int saved_position = position();
1430 // Start with captures started previous to current position
1431 int capture_count = captures_started();
1432 // When we start inside a character class, skip everything inside the class.
1433 if (in_class_escape_state == InClassEscapeState::kInClass) {
1434 // \k is always invalid within a class in unicode mode, thus we should never
1435 // call ScanForCaptures within a class.
1436 DCHECK(!IsUnicodeMode());
1437 int c;
1438 while ((c = current()) != kEndMarker) {
1439 Advance();
1440 if (c == '\\') {
// A backslash escapes the following character, so skip that one as well.
1441 Advance();
1442 } else {
1443 if (c == ']') break;
1444 }
1445 }
1446 }
1447 // Add count of captures after this position.
1448 int n;
1449 while ((n = current()) != kEndMarker) {
1450 Advance();
1451 switch (n) {
1452 case '\\':
// Skip the escaped character so that e.g. '\(' is not counted.
1453 Advance();
1454 break;
1455 case '[': {
// Skip a whole character class; parentheses inside it never open a group.
1456 int class_nest_level = 0;
1457 int c;
1458 while ((c = current()) != kEndMarker) {
1459 Advance();
1460 if (c == '\\') {
1461 Advance();
1462 } else if (c == '[') {
1463 // With /v, '[' inside a class is treated as a nested class.
1464 // Without /v, '[' is a normal character.
1465 if (unicode_sets()) class_nest_level++;
1466 } else if (c == ']') {
1467 if (class_nest_level == 0) break;
1468 class_nest_level--;
1469 }
1470 }
1471 break;
1472 }
1473 case '(':
1474 if (current() == '?') {
1475 // At this point we could be in
1476 // * a non-capturing group '(?:',
1477 // * a lookbehind assertion '(?<=' '(?<!'
1478 // * or a named capture '(?<'.
1479 //
1480 // Of these, only named captures are capturing groups.
1481
1482 Advance();
1483 if (current() != '<') break;
1484
1485 Advance();
1486 if (current() == '=' || current() == '!') break;
1487
1488 // Found a possible named capture. It could turn out to be a syntax
1489 // error (e.g. an unterminated or invalid name), but that distinction
1490 // does not matter for our purposes.
1491 has_named_captures_ = true;
1492 }
// Reached for '(' not followed by '?' and for '(?<' named captures.
1493 capture_count++;
1494 break;
1495 }
1496 }
1497 capture_count_ = capture_count;
// NOTE(review): the embedded listing numbers jump from 1497 to 1499 here; a
// statement may be missing from this copy -- verify against the upstream file.
1499 Reset(saved_position);
1500}
1501
// Tries to read '\' followed by decimal digits as a back reference index.
// On success the digits are consumed, the index is stored in *index_out and
// true is returned. On the visible failure paths the position is reset to the
// backslash and false is returned, so the caller can reinterpret the escape.
1502template <class CharT>
1503bool RegExpParserImpl<CharT>::ParseBackReferenceIndex(int* index_out) {
1504 DCHECK_EQ('\\', current());
1505 DCHECK('1' <= Next() && Next() <= '9');
1506 // Try to parse a decimal literal that is no greater than the total number
1507 // of left capturing parentheses in the input.
1508 int start = position();
1509 int value = Next() - '0';
// Skip the backslash and the first digit, which is already part of `value`.
1510 Advance(2);
1511 while (true) {
1512 base::uc32 c = current();
1513 if (IsDecimalDigit(c)) {
1514 value = 10 * value + (c - '0');
// NOTE(review): the embedded listing numbers jump from 1514 to 1516 and the
// braces below do not balance; the condition guarding this early bail-out
// (presumably an upper bound on `value`) appears to be missing from this
// copy -- restore it from the upstream file.
1516 Reset(start);
1517 return false;
1518 }
1519 Advance();
1520 } else {
1521 break;
1522 }
1523 }
// Indices above the number of groups opened so far can only be valid if more
// groups follow, which requires looking ahead with ScanForCaptures.
1524 if (value > captures_started()) {
// NOTE(review): the embedded listing numbers jump from 1524 to 1526 and the
// braces below do not balance; a line (presumably a condition around the
// ScanForCaptures call) appears to be missing from this copy.
1526 ScanForCaptures(InClassEscapeState::kNotInClass);
1527 }
1528 if (value > capture_count_) {
1529 Reset(start);
1530 return false;
1531 }
1532 }
1533 *index_out = value;
1534 return true;
1535}
1536
1537namespace {
1538
// Appends `code_unit` to `v`, either as a single element or, in the else
// branch, as a lead/trail surrogate pair produced by unibrow::Utf16.
1539void push_code_unit(ZoneVector<base::uc16>* v, uint32_t code_unit) {
// NOTE(review): the embedded listing numbers jump from 1539 to 1541 and the
// `} else {` below has no matching `if`; the condition line is missing from
// this copy. Presumably it tests whether `code_unit` fits into one 16-bit
// unit -- TODO confirm against the upstream file.
1541 v->push_back(code_unit);
1542 } else {
1543 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
1544 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
1545 }
1546}
1547
1548} // namespace
1549
1550template <class CharT>
1551const ZoneVector<base::uc16>* RegExpParserImpl<CharT>::ParseCaptureGroupName() {
1552 // Due to special Advance requirements (see the next comment), rewind by one
1553 // such that names starting with a surrogate pair are parsed correctly for
1554 // patterns where the unicode flag is unset.
1555 //
1556 // Note that we use this odd pattern of rewinding the last advance in order
1557 // to adhere to the common parser behavior of expecting `current` to point at
1558 // the first candidate character for a function (e.g. when entering ParseFoo,
1559 // `current` should point at the first character of Foo).
1560 RewindByOneCodepoint();
1561
1562 ZoneVector<base::uc16>* name =
1563 zone()->template New<ZoneVector<base::uc16>>(zone());
1564
1565 {
1566 // Advance behavior inside this function is tricky since
1567 // RegExpIdentifierName explicitly enables unicode (in spec terms, sets +U)
1568 // and thus allows surrogate pairs and \u{}-style escapes even in
1569 // non-unicode patterns. Therefore Advance within the capture group name
1570 // has to force-enable unicode, and outside the name revert to default
1571 // behavior.
1572 ForceUnicodeScope force_unicode(this);
1573
1574 bool at_start = true;
1575 while (true) {
1576 Advance();
1577 base::uc32 c = current();
1578
1579 // Convert unicode escapes.
1580 if (c == '\\' && Next() == 'u') {
1581 Advance(2);
1582 if (!ParseUnicodeEscape(&c)) {
1583 ReportError(RegExpError::kInvalidUnicodeEscape);
1584 return nullptr;
1585 }
1586 RewindByOneCodepoint();
1587 }
1588
1589 // The backslash char is misclassified as both ID_Start and ID_Continue.
1590 if (c == '\\') {
1591 ReportError(RegExpError::kInvalidCaptureGroupName);
1592 return nullptr;
1593 }
1594
1595 if (at_start) {
1596 if (!IsIdentifierStart(c)) {
1597 ReportError(RegExpError::kInvalidCaptureGroupName);
1598 return nullptr;
1599 }
1600 push_code_unit(name, c);
1601 at_start = false;
1602 } else {
1603 if (c == '>') {
1604 break;
1605 } else if (IsIdentifierPart(c)) {
1606 push_code_unit(name, c);
1607 } else {
1608 ReportError(RegExpError::kInvalidCaptureGroupName);
1609 return nullptr;
1610 }
1611 }
1612 }
1613 }
1614
1615 // This final advance goes back into the state of pointing at the next
1616 // relevant char, which the rest of the parser expects. See also the previous
1617 // comments in this function.
1618 Advance();
1619 return name;
1620}
1621
1622template <class CharT>
1623bool RegExpParserImpl<CharT>::CreateNamedCaptureAtIndex(
1624 const RegExpParserState* state, int index) {
1625 const ZoneVector<base::uc16>* name = state->capture_name();
1626 const std::pair<int, int> non_participating_capture_group_interval =
1627 state->non_participating_capture_group_interval();
1628 DCHECK(0 < index && index <= captures_started_);
1629 DCHECK_NOT_NULL(name);
1630
1631 RegExpCapture* capture = GetCapture(index);
1632 DCHECK_NULL(capture->name());
1633
1634 capture->set_name(name);
1635
1636 if (named_captures_ == nullptr) {
1637 named_captures_ = zone_->template New<
1638 ZoneMap<RegExpCapture*, ZoneList<int>*, RegExpCaptureNameLess>>(zone());
1639 } else {
1640 // Check for duplicates and bail if we find any.
1641 const auto& named_capture_it = named_captures_->find(capture);
1642 if (named_capture_it != named_captures_->end()) {
1643 if (v8_flags.js_regexp_duplicate_named_groups) {
1644 ZoneList<int>* named_capture_indices = named_capture_it->second;
1645 DCHECK_NOT_NULL(named_capture_indices);
1646 DCHECK(!named_capture_indices->is_empty());
1647 for (int named_index : *named_capture_indices) {
1648 if (named_index < non_participating_capture_group_interval.first ||
1649 named_index > non_participating_capture_group_interval.second) {
1650 ReportError(RegExpError::kDuplicateCaptureGroupName);
1651 return false;
1652 }
1653 }
1654 } else {
1655 ReportError(RegExpError::kDuplicateCaptureGroupName);
1656 return false;
1657 }
1658 }
1659 }
1660
1661 auto entry = named_captures_->try_emplace(
1662 capture, zone()->template New<ZoneList<int>>(1, zone()));
1663 entry.first->second->Add(index, zone());
1664 return true;
1665}
1666
// Parses the '<name>' part of a named back reference (\k<name>) and adds the
// result to `builder`. Returns false after an error has been reported (either
// here or by ParseCaptureGroupName), true otherwise. The reference is only
// recorded by name here; the list named_back_references_ is consumed by
// PatchNamedBackReferences.
1667template <class CharT>
1668bool RegExpParserImpl<CharT>::ParseNamedBackReference(
1669 RegExpBuilder* builder, RegExpParserState* state) {
1670 // The parser is assumed to be on the '<' in \k<name>.
1671 if (current() != '<') {
1672 ReportError(RegExpError::kInvalidNamedReference);
1673 return false;
1674 }
1675
1676 Advance();
1677 const ZoneVector<base::uc16>* name = ParseCaptureGroupName();
1678 if (name == nullptr) {
1679 return false;
1680 }
1681
// A reference from inside the group it names is replaced by an empty match.
1682 if (state->IsInsideCaptureGroup(name)) {
1683 builder->AddEmpty();
1684 } else {
1685 RegExpBackReference* atom =
1686 zone()->template New<RegExpBackReference>(zone());
1687 atom->set_name(name);
1688
1689 builder->AddAtom(atom);
1690
1691 if (named_back_references_ == nullptr) {
// NOTE(review): the embedded listing numbers jump from 1691 to 1693; the
// left-hand side of this statement (presumably an assignment to
// named_back_references_) is missing from this copy -- restore it from the
// upstream file.
1693 zone()->template New<ZoneList<RegExpBackReference*>>(1, zone());
1694 }
1695 named_back_references_->Add(atom, zone());
1696 }
1697
1698 return true;
1699}
1700
1701template <class CharT>
1702void RegExpParserImpl<CharT>::PatchNamedBackReferences() {
1703 if (named_back_references_ == nullptr) return;
1704
1705 if (named_captures_ == nullptr) {
1706 ReportError(RegExpError::kInvalidNamedCaptureReference);
1707 return;
1708 }
1709
1710 // Look up and patch the actual capture for each named back reference.
1711
1712 for (int i = 0; i < named_back_references_->length(); i++) {
1713 RegExpBackReference* ref = named_back_references_->at(i);
1714
1715 // Capture used to search the named_captures_ by name, index of the
1716 // capture is never used.
1717 static const int kInvalidIndex = 0;
1718 RegExpCapture* search_capture =
1719 zone()->template New<RegExpCapture>(kInvalidIndex);
1720 DCHECK_NULL(search_capture->name());
1721 search_capture->set_name(ref->name());
1722
1723 const auto& capture_it = named_captures_->find(search_capture);
1724 if (capture_it == named_captures_->end()) {
1725 ReportError(RegExpError::kInvalidNamedCaptureReference);
1726 return;
1727 }
1728
1729 DCHECK_IMPLIES(!v8_flags.js_regexp_duplicate_named_groups,
1730 capture_it->second->length() == 1);
1731 for (int index : *capture_it->second) {
1732 ref->add_capture(GetCapture(index), zone());
1733 }
1734 }
1735}
1736
// Returns the RegExpCapture for the one-based `index`. The captures_ list is
// created on first use and extended with new RegExpCapture objects until it
// holds `known_captures` entries.
1737template <class CharT>
1738RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) {
1739 // The index for the capture groups are one-based. Its index in the list is
1740 // zero-based.
1741 const int known_captures =
// NOTE(review): the embedded listing numbers jump from 1741 to 1743; the
// initializer of known_captures is missing from this copy -- restore it from
// the upstream file.
1743 SBXCHECK(index >= 1 && index <= known_captures);
1744 if (captures_ == nullptr) {
1745 captures_ =
1746 zone()->template New<ZoneList<RegExpCapture*>>(known_captures, zone());
1747 }
// Each new capture receives its one-based index (current list length + 1).
1748 while (captures_->length() < known_captures) {
1749 captures_->Add(zone()->template New<RegExpCapture>(captures_->length() + 1),
1750 zone());
1751 }
1752 return captures_->at(index - 1);
1753}
1754
1755template <class CharT>
1756ZoneVector<RegExpCapture*>* RegExpParserImpl<CharT>::GetNamedCaptures() {
1757 if (named_captures_ == nullptr) {
1758 return nullptr;
1759 }
1760 DCHECK(!named_captures_->empty());
1761
1762 ZoneVector<RegExpCapture*>* flattened_named_captures =
1763 zone()->template New<ZoneVector<RegExpCapture*>>(zone());
1764 for (auto capture : *named_captures_) {
1765 DCHECK_IMPLIES(!v8_flags.js_regexp_duplicate_named_groups,
1766 capture.second->length() == 1);
1767 for (int index : *capture.second) {
1768 flattened_named_captures->push_back(GetCapture(index));
1769 }
1770 }
1771 return flattened_named_captures;
1772}
1773
// Returns has_named_captures_. On the path that does not return early, the
// pattern is first scanned with ScanForCaptures, which may set the flag.
1774template <class CharT>
1775bool RegExpParserImpl<CharT>::HasNamedCaptures(
1776 InClassEscapeState in_class_escape_state) {
// NOTE(review): the embedded listing numbers jump from 1776 to 1778 and the
// closing brace at 1779 has no visible opener; the condition of this early
// return is missing from this copy -- restore it from the upstream file.
1778 return has_named_captures_;
1779 }
1780
1781 ScanForCaptures(in_class_escape_state);
// NOTE(review): the embedded listing numbers jump from 1781 to 1783; a
// statement may be missing from this copy.
1783 return has_named_captures_;
1784}
1785
1786// QuantifierPrefix ::
1787// { DecimalDigits }
1788// { DecimalDigits , }
1789// { DecimalDigits , DecimalDigits }
1790//
1791// Returns true if parsing succeeds, and set the min_out and max_out
1792// values. Values are truncated to RegExpTree::kInfinity if they overflow.
1793template <class CharT>
1794bool RegExpParserImpl<CharT>::ParseIntervalQuantifier(int* min_out,
1795 int* max_out) {
1796 DCHECK_EQ(current(), '{');
1797 int start = position();
1798 Advance();
1799 int min = 0;
1800 if (!IsDecimalDigit(current())) {
1801 Reset(start);
1802 return false;
1803 }
1804 while (IsDecimalDigit(current())) {
1805 int next = current() - '0';
1806 if (min > (RegExpTree::kInfinity - next) / 10) {
1807 // Overflow. Skip past remaining decimal digits and return -1.
1808 do {
1809 Advance();
1810 } while (IsDecimalDigit(current()));
1811 min = RegExpTree::kInfinity;
1812 break;
1813 }
1814 min = 10 * min + next;
1815 Advance();
1816 }
1817 int max = 0;
1818 if (current() == '}') {
1819 max = min;
1820 Advance();
1821 } else if (current() == ',') {
1822 Advance();
1823 if (current() == '}') {
1824 max = RegExpTree::kInfinity;
1825 Advance();
1826 } else {
1827 while (IsDecimalDigit(current())) {
1828 int next = current() - '0';
1829 if (max > (RegExpTree::kInfinity - next) / 10) {
1830 do {
1831 Advance();
1832 } while (IsDecimalDigit(current()));
1833 max = RegExpTree::kInfinity;
1834 break;
1835 }
1836 max = 10 * max + next;
1837 Advance();
1838 }
1839 if (current() != '}') {
1840 Reset(start);
1841 return false;
1842 }
1843 Advance();
1844 }
1845 } else {
1846 Reset(start);
1847 return false;
1848 }
1849 *min_out = min;
1850 *max_out = max;
1851 return true;
1852}
1853
1854template <class CharT>
1855base::uc32 RegExpParserImpl<CharT>::ParseOctalLiteral() {
1856 DCHECK(('0' <= current() && current() <= '7') || !has_more());
1857 // For compatibility with some other browsers (not all), we parse
1858 // up to three octal digits with a value below 256.
1859 // ES#prod-annexB-LegacyOctalEscapeSequence
1860 base::uc32 value = current() - '0';
1861 Advance();
1862 if ('0' <= current() && current() <= '7') {
1863 value = value * 8 + current() - '0';
1864 Advance();
1865 if (value < 32 && '0' <= current() && current() <= '7') {
1866 value = value * 8 + current() - '0';
1867 Advance();
1868 }
1869 }
1870 return value;
1871}
1872
1873template <class CharT>
1874bool RegExpParserImpl<CharT>::ParseHexEscape(int length, base::uc32* value) {
1875 int start = position();
1876 base::uc32 val = 0;
1877 for (int i = 0; i < length; ++i) {
1878 base::uc32 c = current();
1879 int d = base::HexValue(c);
1880 if (d < 0) {
1881 Reset(start);
1882 return false;
1883 }
1884 val = val * 16 + d;
1885 Advance();
1886 }
1887 *value = val;
1888 return true;
1889}
1890
1891// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
// Parses what follows '\u' in a unicode escape. Returns true and stores the
// result in *value on success. If the braced form fails, the position is
// reset to where it was on entry and false is returned.
1892template <class CharT>
1893bool RegExpParserImpl<CharT>::ParseUnicodeEscape(base::uc32* value) {
1894 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1895 // allowed). In the latter case, the number of hex digits between { } is
1896 // arbitrary. \ and u have already been read.
1897 if (current() == '{' && IsUnicodeMode()) {
1898 int start = position();
1899 Advance();
// The braced form accepts values up to 0x10FFFF and requires a closing '}'.
1900 if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
1901 if (current() == '}') {
1902 Advance();
1903 return true;
1904 }
1905 }
1906 Reset(start);
1907 return false;
1908 }
1909 // \u but no {, or \u{...} escapes not allowed.
1910 bool result = ParseHexEscape(4, value);
// In unicode mode a lead surrogate escape directly followed by another
// backslash may be the first half of an escaped surrogate pair.
1911 if (result && IsUnicodeMode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1912 current() == '\\') {
1913 // Attempt to read trail surrogate.
1914 int start = position();
1915 if (Next() == 'u') {
1916 Advance(2);
1917 base::uc32 trail;
1918 if (ParseHexEscape(4, &trail) &&
// NOTE(review): the embedded listing numbers jump from 1918 to 1921; the rest
// of this condition and the start of the statement that uses the two casts
// below are missing from this copy. Presumably the trail value is validated
// and combined with the lead surrogate into *value -- restore from the
// upstream file.
1921 static_cast<base::uc16>(*value), static_cast<base::uc16>(trail));
1922 return true;
1923 }
1924 }
// No usable trail surrogate: undo the lookahead and keep the first escape.
1925 Reset(start);
1926 }
1927 return result;
1928}
1929
1930#ifdef V8_INTL_SUPPORT
1931
1932namespace {
1933
1934bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1935 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1936 if (short_name != nullptr && strcmp(property_name, short_name) == 0)
1937 return true;
1938 for (int i = 0;; i++) {
1939 const char* long_name = u_getPropertyName(
1940 property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1941 if (long_name == nullptr) break;
1942 if (strcmp(property_name, long_name) == 0) return true;
1943 }
1944 return false;
1945}
1946
1947bool IsExactPropertyValueAlias(const char* property_value_name,
1948 UProperty property, int32_t property_value) {
1949 const char* short_name =
1950 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1951 if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
1952 return true;
1953 }
1954 for (int i = 0;; i++) {
1955 const char* long_name = u_getPropertyValueName(
1956 property, property_value,
1957 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1958 if (long_name == nullptr) break;
1959 if (strcmp(property_value_name, long_name) == 0) return true;
1960 }
1961 return false;
1962}
1963
1964void ExtractStringsFromUnicodeSet(const icu::UnicodeSet& set,
1965 CharacterClassStrings* strings,
1966 RegExpFlags flags, Zone* zone) {
1967 DCHECK(set.hasStrings());
1968 DCHECK(IsUnicodeSets(flags));
1969 DCHECK_NOT_NULL(strings);
1970
1971 RegExpTextBuilder::SmallRegExpTreeVector string_storage(zone);
1972 RegExpTextBuilder string_builder(zone, &string_storage, flags);
1973 const bool needs_case_folding = IsIgnoreCase(flags);
1974 icu::UnicodeSetIterator iter(set);
1975 iter.skipToStrings();
1976 while (iter.next()) {
1977 const icu::UnicodeString& s = iter.getString();
1978 const char16_t* p = s.getBuffer();
1979 int32_t length = s.length();
1980 ZoneList<base::uc32>* string =
1981 zone->template New<ZoneList<base::uc32>>(length, zone);
1982 for (int32_t i = 0; i < length;) {
1983 UChar32 c;
1984 U16_NEXT(p, i, length, c);
1985 string_builder.AddUnicodeCharacter(c);
1986 if (needs_case_folding) {
1987 c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
1988 }
1989 string->Add(c, zone);
1990 }
1991 strings->emplace(string->ToVector(), string_builder.ToRegExp());
1992 string_storage.clear();
1993 }
1994}
1995
1996bool LookupPropertyValueName(UProperty property,
1997 const char* property_value_name, bool negate,
1998 ZoneList<CharacterRange>* result_ranges,
1999 CharacterClassStrings* result_strings,
2000 RegExpFlags flags, Zone* zone) {
2001 UProperty property_for_lookup = property;
2002 if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
2003 // For the property Script_Extensions, we have to do the property value
2004 // name lookup as if the property is Script.
2005 property_for_lookup = UCHAR_SCRIPT;
2006 }
2007 int32_t property_value =
2008 u_getPropertyValueEnum(property_for_lookup, property_value_name);
2009 if (property_value == UCHAR_INVALID_CODE) return false;
2010
2011 // We require the property name to match exactly to one of the property value
2012 // aliases. However, u_getPropertyValueEnum uses loose matching.
2013 if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
2014 property_value)) {
2015 return false;
2016 }
2017
2018 UErrorCode ec = U_ZERO_ERROR;
2019 icu::UnicodeSet set;
2020 set.applyIntPropertyValue(property, property_value, ec);
2021 bool success = ec == U_ZERO_ERROR && !set.isEmpty();
2022
2023 if (success) {
2024 if (set.hasStrings()) {
2025 ExtractStringsFromUnicodeSet(set, result_strings, flags, zone);
2026 }
2027 const bool needs_case_folding = IsUnicodeSets(flags) && IsIgnoreCase(flags);
2028 if (needs_case_folding) set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
2029 set.removeAllStrings();
2030 if (negate) set.complement();
2031 for (int i = 0; i < set.getRangeCount(); i++) {
2032 result_ranges->Add(
2033 CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
2034 zone);
2035 }
2036 }
2037 return success;
2038}
2039
// Returns true if `name` is exactly the string literal `literal`.
template <size_t N>
inline bool NameEquals(const char* name, const char (&literal)[N]) {
  // `literal` ends with a NUL within its N characters, so the comparison
  // always stops there at the latest; no explicit length bound is needed.
  return strcmp(name, literal) == 0;
}
2044
2045bool LookupSpecialPropertyValueName(const char* name,
2046 ZoneList<CharacterRange>* result,
2047 bool negate, RegExpFlags flags,
2048 Zone* zone) {
2049 if (NameEquals(name, "Any")) {
2050 if (negate) {
2051 // Leave the list of character ranges empty, since the negation of 'Any'
2052 // is the empty set.
2053 } else {
2054 result->Add(CharacterRange::Everything(), zone);
2055 }
2056 } else if (NameEquals(name, "ASCII")) {
2057 result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
2058 : CharacterRange::Range(0x0, 0x7F),
2059 zone);
2060 } else if (NameEquals(name, "Assigned")) {
2061 return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
2062 !negate, result, nullptr, flags, zone);
2063 } else {
2064 return false;
2065 }
2066 return true;
2067}
2068
2069// Explicitly allowlist supported binary properties. The spec forbids supporting
2070// properties outside of this set to ensure interoperability.
2071bool IsSupportedBinaryProperty(UProperty property, bool unicode_sets) {
2072 switch (property) {
2073 case UCHAR_ALPHABETIC:
2074 // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
2075 // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
2076 case UCHAR_ASCII_HEX_DIGIT:
2077 // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
2078 case UCHAR_BIDI_CONTROL:
2079 case UCHAR_BIDI_MIRRORED:
2080 case UCHAR_CASE_IGNORABLE:
2081 case UCHAR_CASED:
2082 case UCHAR_CHANGES_WHEN_CASEFOLDED:
2083 case UCHAR_CHANGES_WHEN_CASEMAPPED:
2084 case UCHAR_CHANGES_WHEN_LOWERCASED:
2085 case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
2086 case UCHAR_CHANGES_WHEN_TITLECASED:
2087 case UCHAR_CHANGES_WHEN_UPPERCASED:
2088 case UCHAR_DASH:
2089 case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
2090 case UCHAR_DEPRECATED:
2091 case UCHAR_DIACRITIC:
2092 case UCHAR_EMOJI:
2093 case UCHAR_EMOJI_COMPONENT:
2094 case UCHAR_EMOJI_MODIFIER_BASE:
2095 case UCHAR_EMOJI_MODIFIER:
2096 case UCHAR_EMOJI_PRESENTATION:
2097 case UCHAR_EXTENDED_PICTOGRAPHIC:
2098 case UCHAR_EXTENDER:
2099 case UCHAR_GRAPHEME_BASE:
2100 case UCHAR_GRAPHEME_EXTEND:
2101 case UCHAR_HEX_DIGIT:
2102 case UCHAR_ID_CONTINUE:
2103 case UCHAR_ID_START:
2104 case UCHAR_IDEOGRAPHIC:
2105 case UCHAR_IDS_BINARY_OPERATOR:
2106 case UCHAR_IDS_TRINARY_OPERATOR:
2107 case UCHAR_JOIN_CONTROL:
2108 case UCHAR_LOGICAL_ORDER_EXCEPTION:
2109 case UCHAR_LOWERCASE:
2110 case UCHAR_MATH:
2111 case UCHAR_NONCHARACTER_CODE_POINT:
2112 case UCHAR_PATTERN_SYNTAX:
2113 case UCHAR_PATTERN_WHITE_SPACE:
2114 case UCHAR_QUOTATION_MARK:
2115 case UCHAR_RADICAL:
2116 case UCHAR_REGIONAL_INDICATOR:
2117 case UCHAR_S_TERM:
2118 case UCHAR_SOFT_DOTTED:
2119 case UCHAR_TERMINAL_PUNCTUATION:
2120 case UCHAR_UNIFIED_IDEOGRAPH:
2121 case UCHAR_UPPERCASE:
2122 case UCHAR_VARIATION_SELECTOR:
2123 case UCHAR_WHITE_SPACE:
2124 case UCHAR_XID_CONTINUE:
2125 case UCHAR_XID_START:
2126 return true;
2127 case UCHAR_BASIC_EMOJI:
2128 case UCHAR_EMOJI_KEYCAP_SEQUENCE:
2129 case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE:
2130 case UCHAR_RGI_EMOJI_FLAG_SEQUENCE:
2131 case UCHAR_RGI_EMOJI_TAG_SEQUENCE:
2132 case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE:
2133 case UCHAR_RGI_EMOJI:
2134 return unicode_sets;
2135 default:
2136 break;
2137 }
2138 return false;
2139}
2140
2141bool IsBinaryPropertyOfStrings(UProperty property) {
2142 switch (property) {
2143 case UCHAR_BASIC_EMOJI:
2144 case UCHAR_EMOJI_KEYCAP_SEQUENCE:
2145 case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE:
2146 case UCHAR_RGI_EMOJI_FLAG_SEQUENCE:
2147 case UCHAR_RGI_EMOJI_TAG_SEQUENCE:
2148 case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE:
2149 case UCHAR_RGI_EMOJI:
2150 return true;
2151 default:
2152 break;
2153 }
2154 return false;
2155}
2156
// Returns true if |c| may appear in a property name or property value name:
// an ASCII letter, an ASCII digit or '_'.
// See https://tc39.github.io/proposal-regexp-unicode-property-escapes/
//
// Note that using this to validate each parsed char is quite conservative.
// A possible alternative solution would be to only ensure the parsed
// property name/value candidate string does not contain '\0' characters and
// let ICU lookups trigger the final failure.
bool IsUnicodePropertyValueCharacter(char c) {
  const bool is_lower_case_letter = c >= 'a' && c <= 'z';
  const bool is_upper_case_letter = c >= 'A' && c <= 'Z';
  const bool is_digit = c >= '0' && c <= '9';
  return is_lower_case_letter || is_upper_case_letter || is_digit || c == '_';
}
2169
2170} // namespace
2171
2172template <class CharT>
2173bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
2174 ZoneVector<char>* name_2) {
2175 DCHECK(name_1->empty());
2176 DCHECK(name_2->empty());
2177 // Parse the property class as follows:
2178 // - In \p{name}, 'name' is interpreted
2179 // - either as a general category property value name.
2180 // - or as a binary property name.
2181 // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
2182 // and 'value' is interpreted as one of the available property value names.
2183 // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
2184 // - Loose matching is not applied.
2185 if (current() == '{') {
2186 // Parse \p{[PropertyName=]PropertyNameValue}
2187 for (Advance(); current() != '}' && current() != '='; Advance()) {
2188 if (!IsUnicodePropertyValueCharacter(current())) return false;
2189 if (!has_next()) return false;
2190 name_1->push_back(static_cast<char>(current()));
2191 }
2192 if (current() == '=') {
2193 for (Advance(); current() != '}'; Advance()) {
2194 if (!IsUnicodePropertyValueCharacter(current())) return false;
2195 if (!has_next()) return false;
2196 name_2->push_back(static_cast<char>(current()));
2197 }
2198 name_2->push_back(0); // null-terminate string.
2199 }
2200 } else {
2201 return false;
2202 }
2203 Advance();
2204 name_1->push_back(0); // null-terminate string.
2205
2206 DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
2207 DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
2208 return true;
2209}
2210
2211template <class CharT>
2212bool RegExpParserImpl<CharT>::AddPropertyClassRange(
2213 ZoneList<CharacterRange>* add_to_ranges,
2214 CharacterClassStrings* add_to_strings, bool negate,
2215 const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {
2216 if (name_2.empty()) {
2217 // First attempt to interpret as general category property value name.
2218 const char* name = name_1.data();
2219 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
2220 add_to_ranges, add_to_strings, flags(),
2221 zone())) {
2222 return true;
2223 }
2224 // Interpret "Any", "ASCII", and "Assigned".
2225 if (LookupSpecialPropertyValueName(name, add_to_ranges, negate, flags(),
2226 zone())) {
2227 return true;
2228 }
2229 // Then attempt to interpret as binary property name with value name 'Y'.
2230 UProperty property = u_getPropertyEnum(name);
2231 if (!IsSupportedBinaryProperty(property, unicode_sets())) return false;
2232 if (!IsExactPropertyAlias(name, property)) return false;
2233 // Negation of properties with strings is not allowed.
2234 // See
2235 // https://tc39.es/ecma262/#sec-static-semantics-maycontainstrings
2236 if (negate && IsBinaryPropertyOfStrings(property)) return false;
2237 if (unicode_sets()) {
2238 // In /v mode we can't simple lookup the "false" binary property values,
2239 // as the spec requires us to perform case folding before calculating the
2240 // complement.
2241 // See https://tc39.es/ecma262/#sec-compiletocharset
2242 // UnicodePropertyValueExpression :: LoneUnicodePropertyNameOrValue
2243 return LookupPropertyValueName(property, "Y", negate, add_to_ranges,
2244 add_to_strings, flags(), zone());
2245 } else {
2246 return LookupPropertyValueName(property, negate ? "N" : "Y", false,
2247 add_to_ranges, add_to_strings, flags(),
2248 zone());
2249 }
2250 } else {
2251 // Both property name and value name are specified. Attempt to interpret
2252 // the property name as enumerated property.
2253 const char* property_name = name_1.data();
2254 const char* value_name = name_2.data();
2255 UProperty property = u_getPropertyEnum(property_name);
2256 if (!IsExactPropertyAlias(property_name, property)) return false;
2257 if (property == UCHAR_GENERAL_CATEGORY) {
2258 // We want to allow aggregate value names such as "Letter".
2259 property = UCHAR_GENERAL_CATEGORY_MASK;
2260 } else if (property != UCHAR_SCRIPT &&
2261 property != UCHAR_SCRIPT_EXTENSIONS) {
2262 return false;
2263 }
2264 return LookupPropertyValueName(property, value_name, negate, add_to_ranges,
2265 add_to_strings, flags(), zone());
2266 }
2267}
2268
2269#else // V8_INTL_SUPPORT
2270
2271template <class CharT>
2272bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
2273 ZoneVector<char>* name_2) {
2274 return false;
2275}
2276
2277template <class CharT>
2278bool RegExpParserImpl<CharT>::AddPropertyClassRange(
2279 ZoneList<CharacterRange>* add_to_ranges,
2280 CharacterClassStrings* add_to_strings, bool negate,
2281 const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {
2282 return false;
2283}
2284
2285#endif // V8_INTL_SUPPORT
2286
2287template <class CharT>
2288bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(int max_value,
2289 base::uc32* value) {
2290 base::uc32 x = 0;
2291 int d = base::HexValue(current());
2292 if (d < 0) {
2293 return false;
2294 }
2295 while (d >= 0) {
2296 x = x * 16 + d;
2297 if (x > static_cast<base::uc32>(max_value)) {
2298 return false;
2299 }
2300 Advance();
2301 d = base::HexValue(current());
2302 }
2303 *value = x;
2304 return true;
2305}
2306
2307// https://tc39.es/ecma262/#prod-CharacterEscape
2308template <class CharT>
2309base::uc32 RegExpParserImpl<CharT>::ParseCharacterEscape(
2310 InClassEscapeState in_class_escape_state,
2311 bool* is_escaped_unicode_character) {
2312 DCHECK_EQ('\\', current());
2313 DCHECK(has_next());
2314
2315 Advance();
2316
2317 const base::uc32 c = current();
2318 switch (c) {
2319 // CharacterEscape ::
2320 // ControlEscape :: one of
2321 // f n r t v
2322 case 'f':
2323 Advance();
2324 return '\f';
2325 case 'n':
2326 Advance();
2327 return '\n';
2328 case 'r':
2329 Advance();
2330 return '\r';
2331 case 't':
2332 Advance();
2333 return '\t';
2334 case 'v':
2335 Advance();
2336 return '\v';
2337 // CharacterEscape ::
2338 // c ControlLetter
2339 case 'c': {
2340 base::uc32 controlLetter = Next();
2341 base::uc32 letter = controlLetter & ~('A' ^ 'a');
2342 if (letter >= 'A' && letter <= 'Z') {
2343 Advance(2);
2344 // Control letters mapped to ASCII control characters in the range
2345 // 0x00-0x1F.
2346 return controlLetter & 0x1F;
2347 }
2348 if (IsUnicodeMode()) {
2349 // With /u and /v, invalid escapes are not treated as identity escapes.
2350 ReportError(RegExpError::kInvalidUnicodeEscape);
2351 return 0;
2352 }
2353 if (in_class_escape_state == InClassEscapeState::kInClass) {
2354 // Inside a character class, we also accept digits and underscore as
2355 // control characters, unless with /u or /v. See Annex B:
2356 // ES#prod-annexB-ClassControlLetter
2357 if ((controlLetter >= '0' && controlLetter <= '9') ||
2358 controlLetter == '_') {
2359 Advance(2);
2360 return controlLetter & 0x1F;
2361 }
2362 }
2363 // We match JSC in reading the backslash as a literal
2364 // character instead of as starting an escape.
2365 return '\\';
2366 }
2367 // CharacterEscape ::
2368 // 0 [lookahead ∉ DecimalDigit]
2369 // [~UnicodeMode] LegacyOctalEscapeSequence
2370 case '0':
2371 // \0 is interpreted as NUL if not followed by another digit.
2372 if (Next() < '0' || Next() > '9') {
2373 Advance();
2374 return 0;
2375 }
2376 [[fallthrough]];
2377 case '1':
2378 case '2':
2379 case '3':
2380 case '4':
2381 case '5':
2382 case '6':
2383 case '7':
2384 // For compatibility, we interpret a decimal escape that isn't
2385 // a back reference (and therefore either \0 or not valid according
2386 // to the specification) as a 1..3 digit octal character code.
2387 // ES#prod-annexB-LegacyOctalEscapeSequence
2388 if (IsUnicodeMode()) {
2389 // With /u or /v, decimal escape is not interpreted as octal character
2390 // code.
2391 ReportError(RegExpError::kInvalidDecimalEscape);
2392 return 0;
2393 }
2394 return ParseOctalLiteral();
2395 // CharacterEscape ::
2396 // HexEscapeSequence
2397 case 'x': {
2398 Advance();
2399 base::uc32 value;
2400 if (ParseHexEscape(2, &value)) return value;
2401 if (IsUnicodeMode()) {
2402 // With /u or /v, invalid escapes are not treated as identity escapes.
2403 ReportError(RegExpError::kInvalidEscape);
2404 return 0;
2405 }
2406 // If \x is not followed by a two-digit hexadecimal, treat it
2407 // as an identity escape.
2408 return 'x';
2409 }
2410 // CharacterEscape ::
2411 // RegExpUnicodeEscapeSequence [?UnicodeMode]
2412 case 'u': {
2413 Advance();
2414 base::uc32 value;
2415 if (ParseUnicodeEscape(&value)) {
2416 *is_escaped_unicode_character = true;
2417 return value;
2418 }
2419 if (IsUnicodeMode()) {
2420 // With /u or /v, invalid escapes are not treated as identity escapes.
2421 ReportError(RegExpError::kInvalidUnicodeEscape);
2422 return 0;
2423 }
2424 // If \u is not followed by a two-digit hexadecimal, treat it
2425 // as an identity escape.
2426 return 'u';
2427 }
2428 default:
2429 break;
2430 }
2431
2432 // CharacterEscape ::
2433 // IdentityEscape[?UnicodeMode, ?N]
2434 //
2435 // * With /u, no identity escapes except for syntax characters are
2436 // allowed.
2437 // * With /v, no identity escapes except for syntax characters and
2438 // ClassSetReservedPunctuators (if within a class) are allowed.
2439 // * Without /u or /v:
2440 // * '\c' is not an IdentityEscape.
2441 // * '\k' is not an IdentityEscape when named captures exist.
2442 // * Otherwise, all identity escapes are allowed.
2443 if (unicode_sets() && in_class_escape_state == InClassEscapeState::kInClass) {
2444 if (IsClassSetReservedPunctuator(c)) {
2445 Advance();
2446 return c;
2447 }
2448 }
2449 if (IsUnicodeMode()) {
2450 if (!IsSyntaxCharacterOrSlash(c)) {
2451 ReportError(RegExpError::kInvalidEscape);
2452 return 0;
2453 }
2454 Advance();
2455 return c;
2456 }
2457 DCHECK(!IsUnicodeMode());
2458 if (c == 'c') {
2459 ReportError(RegExpError::kInvalidEscape);
2460 return 0;
2461 }
2462 Advance();
2463 // Note: It's important to Advance before the HasNamedCaptures call s.t. we
2464 // don't start scanning in the middle of an escape.
2465 if (c == 'k' && HasNamedCaptures(in_class_escape_state)) {
2466 ReportError(RegExpError::kInvalidEscape);
2467 return 0;
2468 }
2469 return c;
2470}
2471
2472// https://tc39.es/ecma262/#prod-ClassRanges
2473template <class CharT>
2474RegExpTree* RegExpParserImpl<CharT>::ParseClassRanges(
2475 ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents) {
2476 base::uc32 char_1, char_2;
2477 bool is_class_1, is_class_2;
2478 while (has_more() && current() != ']') {
2479 ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
2480 &is_class_1 CHECK_FAILED);
2481 // ClassAtom
2482 if (current() == '-') {
2483 Advance();
2484 if (!has_more()) {
2485 // If we reach the end we break out of the loop and let the
2486 // following code report an error.
2487 break;
2488 } else if (current() == ']') {
2489 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
2490 ranges->Add(CharacterRange::Singleton('-'), zone());
2491 break;
2492 }
2493 ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
2494 &is_class_2 CHECK_FAILED);
2495 if (is_class_1 || is_class_2) {
2496 // Either end is an escaped character class. Treat the '-' verbatim.
2497 if (IsUnicodeMode()) {
2498 // ES2015 21.2.2.15.1 step 1.
2499 return ReportError(RegExpError::kInvalidCharacterClass);
2500 }
2501 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
2502 ranges->Add(CharacterRange::Singleton('-'), zone());
2503 if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
2504 continue;
2505 }
2506 // ES2015 21.2.2.15.1 step 6.
2507 if (char_1 > char_2) {
2508 return ReportError(RegExpError::kOutOfOrderCharacterClass);
2509 }
2510 ranges->Add(CharacterRange::Range(char_1, char_2), zone());
2511 } else {
2512 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
2513 }
2514 }
2515 return nullptr;
2516}
2517
2518// https://tc39.es/ecma262/#prod-ClassEscape
2519template <class CharT>
2520void RegExpParserImpl<CharT>::ParseClassEscape(
2521 ZoneList<CharacterRange>* ranges, Zone* zone,
2522 bool add_unicode_case_equivalents, base::uc32* char_out,
2523 bool* is_class_escape) {
2524 *is_class_escape = false;
2525
2526 if (current() != '\\') {
2527 // Not a ClassEscape.
2528 *char_out = current();
2529 Advance();
2530 return;
2531 }
2532
2533 const base::uc32 next = Next();
2534 switch (next) {
2535 case 'b':
2536 *char_out = '\b';
2537 Advance(2);
2538 return;
2539 case '-':
2540 if (IsUnicodeMode()) {
2541 *char_out = next;
2542 Advance(2);
2543 return;
2544 }
2545 break;
2546 case kEndMarker:
2547 ReportError(RegExpError::kEscapeAtEndOfPattern);
2548 return;
2549 default:
2550 break;
2551 }
2552
2553 static constexpr InClassEscapeState kInClassEscape =
2554 InClassEscapeState::kInClass;
2555 *is_class_escape =
2556 TryParseCharacterClassEscape(next, kInClassEscape, ranges, nullptr, zone,
2557 add_unicode_case_equivalents);
2558 if (*is_class_escape) return;
2559
2560 bool dummy = false; // Unused.
2561 *char_out = ParseCharacterEscape(kInClassEscape, &dummy);
2562}
2563
2564// https://tc39.es/ecma262/#prod-CharacterClassEscape
2565template <class CharT>
2566bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape(
2567 base::uc32 next, InClassEscapeState in_class_escape_state,
2568 ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings,
2569 Zone* zone, bool add_unicode_case_equivalents) {
2570 DCHECK_EQ(current(), '\\');
2571 DCHECK_EQ(Next(), next);
2572
2573 switch (next) {
2574 case 'd':
2575 case 'D':
2576 case 's':
2577 case 'S':
2578 case 'w':
2579 case 'W':
2580 CharacterRange::AddClassEscape(static_cast<StandardCharacterSet>(next),
2581 ranges, add_unicode_case_equivalents,
2582 zone);
2583 Advance(2);
2584 return true;
2585 case 'p':
2586 case 'P': {
2587 if (!IsUnicodeMode()) return false;
2588 bool negate = next == 'P';
2589 Advance(2);
2590 ZoneVector<char> name_1(zone);
2591 ZoneVector<char> name_2(zone);
2592 if (!ParsePropertyClassName(&name_1, &name_2) ||
2593 !AddPropertyClassRange(ranges, strings, negate, name_1, name_2)) {
2594 ReportError(in_class_escape_state == InClassEscapeState::kInClass
2595 ? RegExpError::kInvalidClassPropertyName
2596 : RegExpError::kInvalidPropertyName);
2597 }
2598 return true;
2599 }
2600 default:
2601 return false;
2602 }
2603}
2604
2605namespace {
2606
2607// Add |string| to |ranges| if length of |string| == 1, otherwise add |string|
2608// to |strings|.
2609void AddClassString(ZoneList<base::uc32>* normalized_string,
2610 RegExpTree* regexp_string, ZoneList<CharacterRange>* ranges,
2611 CharacterClassStrings* strings, Zone* zone) {
2612 if (normalized_string->length() == 1) {
2613 ranges->Add(CharacterRange::Singleton(normalized_string->at(0)), zone);
2614 } else {
2615 strings->emplace(normalized_string->ToVector(), regexp_string);
2616 }
2617}
2618
2619} // namespace
2620
2621// https://tc39.es/ecma262/#prod-ClassStringDisjunction
2622template <class CharT>
2623RegExpTree* RegExpParserImpl<CharT>::ParseClassStringDisjunction(
2624 ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings) {
2625 DCHECK(unicode_sets());
2626 DCHECK_EQ(current(), '\\');
2627 DCHECK_EQ(Next(), 'q');
2628 Advance(2);
2629 if (current() != '{') {
2630 // Identity escape of 'q' is not allowed in unicode mode.
2631 return ReportError(RegExpError::kInvalidEscape);
2632 }
2633 Advance();
2634
2635 ZoneList<base::uc32>* string =
2636 zone()->template New<ZoneList<base::uc32>>(4, zone());
2637 RegExpTextBuilder::SmallRegExpTreeVector string_storage(zone());
2638 RegExpTextBuilder string_builder(zone(), &string_storage, flags());
2639
2640 while (has_more() && current() != '}') {
2641 if (current() == '|') {
2642 AddClassString(string, string_builder.ToRegExp(), ranges, strings,
2643 zone());
2644 string = zone()->template New<ZoneList<base::uc32>>(4, zone());
2645 string_storage.clear();
2646 Advance();
2647 } else {
2648 base::uc32 c = ParseClassSetCharacter(CHECK_FAILED);
2649 if (ignore_case()) {
2650#ifdef V8_INTL_SUPPORT
2651 c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2652#else
2653 c = AsciiAlphaToLower(c);
2654#endif
2655 }
2656 string->Add(c, zone());
2657 string_builder.AddUnicodeCharacter(c);
2658 }
2659 }
2660
2661 AddClassString(string, string_builder.ToRegExp(), ranges, strings, zone());
2662 CharacterRange::Canonicalize(ranges);
2663
2664 // We don't need to handle missing closing '}' here.
2665 // If the character class is correctly closed, ParseClassSetCharacter will
2666 // report an error.
2667 Advance();
2668 return nullptr;
2669}
2670
2671// https://tc39.es/ecma262/#prod-ClassSetOperand
2672// Tree returned based on type_out:
2673// * kNestedClass: RegExpClassSetExpression
2674// * For all other types: RegExpClassSetOperand
2675template <class CharT>
2676RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
2677 const RegExpBuilder* builder, ClassSetOperandType* type_out) {
2678 ZoneList<CharacterRange>* ranges =
2679 zone()->template New<ZoneList<CharacterRange>>(1, zone());
2680 CharacterClassStrings* strings =
2681 zone()->template New<CharacterClassStrings>(zone());
2682 base::uc32 character;
2683 RegExpTree* tree = ParseClassSetOperand(builder, type_out, ranges, strings,
2684 &character CHECK_FAILED);
2685 DCHECK_IMPLIES(*type_out != ClassSetOperandType::kNestedClass,
2686 tree == nullptr);
2687 DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter,
2688 ranges->is_empty());
2689 DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter,
2690 strings->empty());
2691 DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
2692 ranges->is_empty());
2693 DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
2694 strings->empty());
2695 DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,
2696 tree->IsClassSetExpression());
2697 // ClassSetRange is only used within ClassSetUnion().
2698 DCHECK_NE(*type_out, ClassSetOperandType::kClassSetRange);
2699 // There are no restrictions for kCharacterClassEscape.
2700 // CharacterClassEscape includes \p{}, which can contain ranges, strings or
2701 // both and \P{}, which could contain nothing (i.e. \P{Any}).
2702 if (tree == nullptr) {
2703 if (*type_out == ClassSetOperandType::kClassSetCharacter) {
2704 AddMaybeSimpleCaseFoldedRange(ranges,
2705 CharacterRange::Singleton(character));
2706 }
2707 tree = zone()->template New<RegExpClassSetOperand>(ranges, strings);
2708 }
2709 return tree;
2710}
2711
2712// https://tc39.es/ecma262/#prod-ClassSetOperand
2713// Based on |type_out| either a tree is returned or
2714// |ranges|/|strings|/|character| modified. If a tree is returned,
2715// ranges/strings are not modified. If |type_out| is kNestedClass, a tree of
2716// type RegExpClassSetExpression is returned. If | type_out| is
2717// kClassSetCharacter, |character| is set and nullptr returned. For all other
2718// types, |ranges|/|strings|/|character| is modified and nullptr is returned.
2719template <class CharT>
2720RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
2721 const RegExpBuilder* builder, ClassSetOperandType* type_out,
2722 ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings,
2723 base::uc32* character) {
2724 DCHECK(unicode_sets());
2725 base::uc32 c = current();
2726 if (c == '\\') {
2727 const base::uc32 next = Next();
2728 if (next == 'q') {
2729 *type_out = ClassSetOperandType::kClassStringDisjunction;
2730 ParseClassStringDisjunction(ranges, strings CHECK_FAILED);
2731 return nullptr;
2732 }
2733 static constexpr InClassEscapeState kInClassEscape =
2734 InClassEscapeState::kInClass;
2735 const bool add_unicode_case_equivalents = ignore_case();
2736 if (TryParseCharacterClassEscape(next, kInClassEscape, ranges, strings,
2737 zone(), add_unicode_case_equivalents)) {
2738 *type_out = ClassSetOperandType::kCharacterClassEscape;
2739 return nullptr;
2740 }
2741 }
2742
2743 if (c == '[') {
2744 *type_out = ClassSetOperandType::kNestedClass;
2745 return ParseCharacterClass(builder);
2746 }
2747
2748 *type_out = ClassSetOperandType::kClassSetCharacter;
2749 c = ParseClassSetCharacter(CHECK_FAILED);
2750 *character = c;
2751 return nullptr;
2752}
2753
2754template <class CharT>
2755base::uc32 RegExpParserImpl<CharT>::ParseClassSetCharacter() {
2756 DCHECK(unicode_sets());
2757 const base::uc32 c = current();
2758 if (c == '\\') {
2759 const base::uc32 next = Next();
2760 switch (next) {
2761 case 'b':
2762 Advance(2);
2763 return '\b';
2764 case kEndMarker:
2765 ReportError(RegExpError::kEscapeAtEndOfPattern);
2766 return 0;
2767 }
2768 static constexpr InClassEscapeState kInClassEscape =
2769 InClassEscapeState::kInClass;
2770
2771 bool dummy = false; // Unused.
2772 return ParseCharacterEscape(kInClassEscape, &dummy);
2773 }
2774 if (IsClassSetSyntaxCharacter(c)) {
2775 ReportError(RegExpError::kInvalidCharacterInClass);
2776 return 0;
2777 }
2778 if (IsClassSetReservedDoublePunctuator(c)) {
2779 ReportError(RegExpError::kInvalidClassSetOperation);
2780 return 0;
2781 }
2782 Advance();
2783 return c;
2784}
2785
2786namespace {
2787
2788bool MayContainStrings(ClassSetOperandType type, RegExpTree* operand) {
2789 switch (type) {
2790 case ClassSetOperandType::kClassSetCharacter:
2791 case ClassSetOperandType::kClassSetRange:
2792 return false;
2793 case ClassSetOperandType::kCharacterClassEscape:
2794 case ClassSetOperandType::kClassStringDisjunction:
2795 return operand->AsClassSetOperand()->has_strings();
2796 case ClassSetOperandType::kNestedClass:
2797 if (operand->IsClassRanges()) return false;
2798 return operand->AsClassSetExpression()->may_contain_strings();
2799 }
2800}
2801
2802} // namespace
2803
2804template <class CharT>
2805void RegExpParserImpl<CharT>::AddMaybeSimpleCaseFoldedRange(
2806 ZoneList<CharacterRange>* ranges, CharacterRange new_range) {
2807 DCHECK(unicode_sets());
2808 if (ignore_case()) {
2809 ZoneList<CharacterRange>* new_ranges =
2810 zone()->template New<ZoneList<CharacterRange>>(2, zone());
2811 new_ranges->Add(new_range, zone());
2812 CharacterRange::AddUnicodeCaseEquivalents(new_ranges, zone());
2813 ranges->AddAll(*new_ranges, zone());
2814 } else {
2815 ranges->Add(new_range, zone());
2816 }
2817 CharacterRange::Canonicalize(ranges);
2818}
2819
2820// https://tc39.es/ecma262/#prod-ClassUnion
2821template <class CharT>
2822RegExpTree* RegExpParserImpl<CharT>::ParseClassUnion(
2823 const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
2824 ClassSetOperandType first_operand_type, ZoneList<CharacterRange>* ranges,
2825 CharacterClassStrings* strings, base::uc32 character) {
2826 DCHECK(unicode_sets());
2827 ZoneList<RegExpTree*>* operands =
2828 zone()->template New<ZoneList<RegExpTree*>>(2, zone());
2829 bool may_contain_strings = false;
2830 // Add the lhs to operands if necessary.
2831 // Either the lhs values were added to |ranges|/|strings| (in which case
2832 // |first_operand| is nullptr), or the lhs was evaluated to a tree and passed
2833 // as |first_operand| (in which case |ranges| and |strings| are empty).
2834 if (first_operand != nullptr) {
2835 may_contain_strings = MayContainStrings(first_operand_type, first_operand);
2836 operands->Add(first_operand, zone());
2837 }
2838 ClassSetOperandType last_type = first_operand_type;
2839 while (has_more() && current() != ']') {
2840 if (current() == '-') {
2841 // Mix of ClassSetRange and ClassSubtraction is not allowed.
2842 if (Next() == '-') {
2843 return ReportError(RegExpError::kInvalidClassSetOperation);
2844 }
2845 Advance();
2846 if (!has_more()) {
2847 // If we reach the end we break out of the loop and let the
2848 // following code report an error.
2849 break;
2850 }
2851 // If the lhs and rhs around '-' are both ClassSetCharacters, they
2852 // represent a character range.
2853 // In case one of them is not a ClassSetCharacter, it is a syntax error,
2854 // as '-' can not be used unescaped within a class with /v.
2855 // See
2856 // https://tc39.es/ecma262/#prod-ClassSetRange
2857 if (last_type != ClassSetOperandType::kClassSetCharacter) {
2858 return ReportError(RegExpError::kInvalidCharacterClass);
2859 }
2860 base::uc32 from = character;
2861 ParseClassSetOperand(builder, &last_type, ranges, strings,
2862 &character CHECK_FAILED);
2863 if (last_type != ClassSetOperandType::kClassSetCharacter) {
2864 return ReportError(RegExpError::kInvalidCharacterClass);
2865 }
2866 if (from > character) {
2867 return ReportError(RegExpError::kOutOfOrderCharacterClass);
2868 }
2869 AddMaybeSimpleCaseFoldedRange(ranges,
2870 CharacterRange::Range(from, character));
2871 last_type = ClassSetOperandType::kClassSetRange;
2872 } else {
2873 DCHECK_NE(current(), '-');
2874 if (last_type == ClassSetOperandType::kClassSetCharacter) {
2875 AddMaybeSimpleCaseFoldedRange(ranges,
2876 CharacterRange::Singleton(character));
2877 }
2878 RegExpTree* operand = ParseClassSetOperand(
2879 builder, &last_type, ranges, strings, &character CHECK_FAILED);
2880 if (operand != nullptr) {
2881 may_contain_strings |= MayContainStrings(last_type, operand);
2882 // Add the range we started building as operand and reset the current
2883 // range.
2884 if (!ranges->is_empty() || !strings->empty()) {
2885 may_contain_strings |= !strings->empty();
2886 operands->Add(
2887 zone()->template New<RegExpClassSetOperand>(ranges, strings),
2888 zone());
2889 ranges = zone()->template New<ZoneList<CharacterRange>>(2, zone());
2890 strings = zone()->template New<CharacterClassStrings>(zone());
2891 }
2892 operands->Add(operand, zone());
2893 }
2894 }
2895 }
2896
2897 if (!has_more()) {
2898 return ReportError(RegExpError::kUnterminatedCharacterClass);
2899 }
2900
2901 if (last_type == ClassSetOperandType::kClassSetCharacter) {
2902 AddMaybeSimpleCaseFoldedRange(ranges, CharacterRange::Singleton(character));
2903 }
2904
2905 // Add the range we started building as operand.
2906 if (!ranges->is_empty() || !strings->empty()) {
2907 may_contain_strings |= !strings->empty();
2908 operands->Add(zone()->template New<RegExpClassSetOperand>(ranges, strings),
2909 zone());
2910 }
2911
2912 DCHECK_EQ(current(), ']');
2913 Advance();
2914
2915 if (is_negated && may_contain_strings) {
2916 return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
2917 }
2918
2919 if (operands->is_empty()) {
2920 // Return empty expression if no operands were added (e.g. [\P{Any}]
2921 // produces an empty range).
2922 DCHECK(ranges->is_empty());
2923 DCHECK(strings->empty());
2924 return RegExpClassSetExpression::Empty(zone(), is_negated);
2925 }
2926
2927 return zone()->template New<RegExpClassSetExpression>(
2928 RegExpClassSetExpression::OperationType::kUnion, is_negated,
2929 may_contain_strings, operands);
2930}
2931
2932// https://tc39.es/ecma262/#prod-ClassIntersection
2933template <class CharT>
2934RegExpTree* RegExpParserImpl<CharT>::ParseClassIntersection(
2935 const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
2936 ClassSetOperandType first_operand_type) {
2937 DCHECK(unicode_sets());
2938 DCHECK(current() == '&' && Next() == '&');
2939 bool may_contain_strings =
2940 MayContainStrings(first_operand_type, first_operand);
2941 ZoneList<RegExpTree*>* operands =
2942 zone()->template New<ZoneList<RegExpTree*>>(2, zone());
2943 operands->Add(first_operand, zone());
2944 while (has_more() && current() != ']') {
2945 if (current() != '&' || Next() != '&') {
2946 return ReportError(RegExpError::kInvalidClassSetOperation);
2947 }
2948 Advance(2);
2949 // [lookahead ≠ &]
2950 if (current() == '&') {
2951 return ReportError(RegExpError::kInvalidCharacterInClass);
2952 }
2953
2954 ClassSetOperandType operand_type;
2955 RegExpTree* operand =
2956 ParseClassSetOperand(builder, &operand_type CHECK_FAILED);
2957 may_contain_strings &= MayContainStrings(operand_type, operand);
2958 operands->Add(operand, zone());
2959 }
2960 if (!has_more()) {
2961 return ReportError(RegExpError::kUnterminatedCharacterClass);
2962 }
2963 if (is_negated && may_contain_strings) {
2964 return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
2965 }
2966 DCHECK_EQ(current(), ']');
2967 Advance();
2968 return zone()->template New<RegExpClassSetExpression>(
2969 RegExpClassSetExpression::OperationType::kIntersection, is_negated,
2970 may_contain_strings, operands);
2971}
2972
2973// https://tc39.es/ecma262/#prod-ClassSubtraction
2974template <class CharT>
2975RegExpTree* RegExpParserImpl<CharT>::ParseClassSubtraction(
2976 const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,
2977 ClassSetOperandType first_operand_type) {
2978 DCHECK(unicode_sets());
2979 DCHECK(current() == '-' && Next() == '-');
2980 const bool may_contain_strings =
2981 MayContainStrings(first_operand_type, first_operand);
2982 if (is_negated && may_contain_strings) {
2983 return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
2984 }
2985 ZoneList<RegExpTree*>* operands =
2986 zone()->template New<ZoneList<RegExpTree*>>(2, zone());
2987 operands->Add(first_operand, zone());
2988 while (has_more() && current() != ']') {
2989 if (current() != '-' || Next() != '-') {
2990 return ReportError(RegExpError::kInvalidClassSetOperation);
2991 }
2992 Advance(2);
2993 ClassSetOperandType dummy; // unused
2994 RegExpTree* operand = ParseClassSetOperand(builder, &dummy CHECK_FAILED);
2995 operands->Add(operand, zone());
2996 }
2997 if (!has_more()) {
2998 return ReportError(RegExpError::kUnterminatedCharacterClass);
2999 }
3000 DCHECK_EQ(current(), ']');
3001 Advance();
3002 return zone()->template New<RegExpClassSetExpression>(
3003 RegExpClassSetExpression::OperationType::kSubtraction, is_negated,
3004 may_contain_strings, operands);
3005}
3006
3007// https://tc39.es/ecma262/#prod-CharacterClass
3008template <class CharT>
3009RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass(
3010 const RegExpBuilder* builder) {
3011 DCHECK_EQ(current(), '[');
3012 Advance();
3013 bool is_negated = false;
3014 if (current() == '^') {
3015 is_negated = true;
3016 Advance();
3017 }
3018 ZoneList<CharacterRange>* ranges =
3019 zone()->template New<ZoneList<CharacterRange>>(2, zone());
3020 if (current() == ']') {
3021 Advance();
3022 if (unicode_sets()) {
3023 return RegExpClassSetExpression::Empty(zone(), is_negated);
3024 } else {
3025 RegExpClassRanges::ClassRangesFlags class_ranges_flags;
3026 if (is_negated) class_ranges_flags = RegExpClassRanges::NEGATED;
3027 return zone()->template New<RegExpClassRanges>(zone(), ranges,
3028 class_ranges_flags);
3029 }
3030 }
3031
3032 if (!unicode_sets()) {
3033 bool add_unicode_case_equivalents = IsUnicodeMode() && ignore_case();
3034 ParseClassRanges(ranges, add_unicode_case_equivalents CHECK_FAILED);
3035 if (!has_more()) {
3036 return ReportError(RegExpError::kUnterminatedCharacterClass);
3037 }
3038 DCHECK_EQ(current(), ']');
3039 Advance();
3040 RegExpClassRanges::ClassRangesFlags character_class_flags;
3041 if (is_negated) character_class_flags = RegExpClassRanges::NEGATED;
3042 return zone()->template New<RegExpClassRanges>(zone(), ranges,
3043 character_class_flags);
3044 } else {
3045 ClassSetOperandType operand_type;
3046 CharacterClassStrings* strings =
3047 zone()->template New<CharacterClassStrings>(zone());
3048 base::uc32 character;
3049 RegExpTree* operand = ParseClassSetOperand(
3050 builder, &operand_type, ranges, strings, &character CHECK_FAILED);
3051 switch (current()) {
3052 case '-':
3053 if (Next() == '-') {
3054 if (operand == nullptr) {
3055 if (operand_type == ClassSetOperandType::kClassSetCharacter) {
3056 AddMaybeSimpleCaseFoldedRange(
3057 ranges, CharacterRange::Singleton(character));
3058 }
3059 operand =
3060 zone()->template New<RegExpClassSetOperand>(ranges, strings);
3061 }
3062 return ParseClassSubtraction(builder, is_negated, operand,
3063 operand_type);
3064 }
3065 // ClassSetRange is handled in ParseClassUnion().
3066 break;
3067 case '&':
3068 if (Next() == '&') {
3069 if (operand == nullptr) {
3070 if (operand_type == ClassSetOperandType::kClassSetCharacter) {
3071 AddMaybeSimpleCaseFoldedRange(
3072 ranges, CharacterRange::Singleton(character));
3073 }
3074 operand =
3075 zone()->template New<RegExpClassSetOperand>(ranges, strings);
3076 }
3077 return ParseClassIntersection(builder, is_negated, operand,
3078 operand_type);
3079 }
3080 }
3081 return ParseClassUnion(builder, is_negated, operand, operand_type, ranges,
3082 strings, character);
3083 }
3084}
3085
3086#undef CHECK_FAILED
3087
3088template <class CharT>
3089bool RegExpParserImpl<CharT>::Parse(RegExpCompileData* result) {
3091 RegExpTree* tree = ParsePattern();
3092
3093 if (failed()) {
3094 DCHECK_NULL(tree);
3095 DCHECK_NE(error_, RegExpError::kNone);
3096 result->error = error_;
3097 result->error_pos = error_pos_;
3098 return false;
3099 }
3100
3101 DCHECK_NOT_NULL(tree);
3102 DCHECK_EQ(error_, RegExpError::kNone);
3103 if (v8_flags.trace_regexp_parser) {
3104 StdoutStream os;
3105 tree->Print(os, zone());
3106 os << "\n";
3107 }
3108
3109 result->tree = tree;
3110 const int capture_count = captures_started();
3111 result->simple = tree->IsAtom() && simple() && capture_count == 0;
3112 result->contains_anchor = contains_anchor();
3113 result->capture_count = capture_count;
3114 result->named_captures = GetNamedCaptures();
3115 return true;
3116}
3117
3118void RegExpBuilder::FlushText() { text_builder().FlushText(); }
3119
3120void RegExpBuilder::AddCharacter(base::uc16 c) {
3121 pending_empty_ = false;
3122 text_builder().AddCharacter(c);
3123}
3124
3125void RegExpBuilder::AddUnicodeCharacter(base::uc32 c) {
3126 pending_empty_ = false;
3127 text_builder().AddUnicodeCharacter(c);
3128}
3129
3130void RegExpBuilder::AddEscapedUnicodeCharacter(base::uc32 character) {
3131 pending_empty_ = false;
3132 text_builder().AddEscapedUnicodeCharacter(character);
3133}
3134
3135void RegExpBuilder::AddEmpty() {
3136 text_builder().FlushPendingSurrogate();
3137 pending_empty_ = true;
3138}
3139
3140void RegExpBuilder::AddClassRanges(RegExpClassRanges* cc) {
3141 pending_empty_ = false;
3142 text_builder().AddClassRanges(cc);
3143}
3144
3145void RegExpBuilder::AddAtom(RegExpTree* term) {
3146 if (term->IsEmpty()) {
3147 AddEmpty();
3148 return;
3149 }
3150 pending_empty_ = false;
3151 if (term->IsTextElement()) {
3152 text_builder().AddAtom(term);
3153 } else {
3154 FlushText();
3155 terms_.emplace_back(term);
3156 }
3157}
3158
3159void RegExpBuilder::AddTerm(RegExpTree* term) {
3160 DCHECK(!term->IsEmpty());
3161 pending_empty_ = false;
3162 if (term->IsTextElement()) {
3163 text_builder().AddTerm(term);
3164 } else {
3165 FlushText();
3166 terms_.emplace_back(term);
3167 }
3168}
3169
3170void RegExpBuilder::AddAssertion(RegExpTree* assert) {
3171 FlushText();
3172 pending_empty_ = false;
3173 terms_.emplace_back(assert);
3174}
3175
3176void RegExpBuilder::NewAlternative() { FlushTerms(); }
3177
3178void RegExpBuilder::FlushTerms() {
3179 FlushText();
3180 size_t num_terms = terms_.size();
3181 RegExpTree* alternative;
3182 if (num_terms == 0) {
3183 alternative = zone()->New<RegExpEmpty>();
3184 } else if (num_terms == 1) {
3185 alternative = terms_.back();
3186 } else {
3187 alternative =
3188 zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>(
3189 base::VectorOf(terms_.begin(), terms_.size()), zone()));
3190 }
3191 alternatives_.emplace_back(alternative);
3192 terms_.clear();
3193}
3194
3195RegExpTree* RegExpBuilder::ToRegExp() {
3196 FlushTerms();
3197 size_t num_alternatives = alternatives_.size();
3198 if (num_alternatives == 0) return zone()->New<RegExpEmpty>();
3199 if (num_alternatives == 1) return alternatives_.back();
3200 return zone()->New<RegExpDisjunction>(zone()->New<ZoneList<RegExpTree*>>(
3201 base::VectorOf(alternatives_.begin(), alternatives_.size()), zone()));
3202}
3203
3204bool RegExpBuilder::AddQuantifierToAtom(
3205 int min, int max, int index,
3206 RegExpQuantifier::QuantifierType quantifier_type) {
3207 if (pending_empty_) {
3208 pending_empty_ = false;
3209 return true;
3210 }
3211 RegExpTree* atom = text_builder().PopLastAtom();
3212 if (atom != nullptr) {
3213 FlushText();
3214 } else if (!terms_.empty()) {
3215 atom = terms_.back();
3216 terms_.pop_back();
3217 if (atom->IsLookaround()) {
3218 // With /u or /v, lookarounds are not quantifiable.
3219 if (IsUnicodeMode()) return false;
3220 // Lookbehinds are not quantifiable.
3221 if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
3222 return false;
3223 }
3224 }
3225 if (atom->max_match() == 0) {
3226 // Guaranteed to only match an empty string.
3227 if (min == 0) {
3228 return true;
3229 }
3230 terms_.emplace_back(atom);
3231 return true;
3232 }
3233 } else {
3234 // Only call immediately after adding an atom or character!
3235 UNREACHABLE();
3236 }
3237 terms_.emplace_back(
3238 zone()->New<RegExpQuantifier>(min, max, quantifier_type, index, atom));
3239 return true;
3240}
3241
3242template class RegExpParserImpl<uint8_t>;
3243template class RegExpParserImpl<base::uc16>;
3244
3245} // namespace
3246
3247// static
3248bool RegExpParser::ParseRegExpFromHeapString(Isolate* isolate, Zone* zone,
3250 RegExpFlags flags,
3253 uintptr_t stack_limit = isolate->stack_guard()->real_climit();
3254 String::FlatContent content = input->GetFlatContent(no_gc);
3255 if (content.IsOneByte()) {
3257 return RegExpParserImpl<uint8_t>{v.begin(), v.length(), flags,
3258 stack_limit, zone, no_gc}
3259 .Parse(result);
3260 } else {
3262 return RegExpParserImpl<base::uc16>{v.begin(), v.length(), flags,
3263 stack_limit, zone, no_gc}
3264 .Parse(result);
3265 }
3266}
3267
3268// static
3269template <class CharT>
3270bool RegExpParser::VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit,
3271 const CharT* input, int input_length,
3272 RegExpFlags flags,
3274 const DisallowGarbageCollection& no_gc) {
3275 return RegExpParserImpl<CharT>{input, input_length, flags,
3276 stack_limit, zone, no_gc}
3277 .Parse(result);
3278}
3279
3280template bool RegExpParser::VerifyRegExpSyntax<uint8_t>(
3281 Zone*, uintptr_t, const uint8_t*, int, RegExpFlags, RegExpCompileData*,
3283template bool RegExpParser::VerifyRegExpSyntax<base::uc16>(
3284 Zone*, uintptr_t, const base::uc16*, int, RegExpFlags, RegExpCompileData*,
3286
3287} // namespace internal
3288} // namespace v8
friend Zone
Definition asm-types.cc:195
const char * name
Definition builtins.cc:39
#define SBXCHECK(condition)
Definition check.h:61
SourcePosition pos
static uint16_t LeadSurrogate(uint32_t char_code)
Definition unicode.h:126
static const uchar kMaxNonSurrogateCharCode
Definition unicode.h:116
static uint16_t TrailSurrogate(uint32_t char_code)
Definition unicode.h:129
static int CombineSurrogatePair(uchar lead, uchar trail)
Definition unicode.h:113
static bool IsTrailSurrogate(int code)
Definition unicode.h:109
static bool IsLeadSurrogate(int code)
Definition unicode.h:106
int length() const
Definition vector.h:64
constexpr T * begin() const
Definition vector.h:96
static V8_EXPORT_PRIVATE void AddClassEscape(StandardCharacterSet standard_character_set, ZoneList< CharacterRange > *ranges, bool add_unicode_case_equivalents, Zone *zone)
static const int kInfinity
Definition regexp-ast.h:196
base::Vector< const uint8_t > ToOneByteVector() const
Definition string.h:139
base::Vector< const base::uc16 > ToUC16Vector() const
Definition string.h:145
Zone * zone_
XMMRegister const input_
JSRegExp::Flags flags_
int start
LineAndColumn current
DisallowGarbageCollection no_gc_
other heap size flags(e.g. initial_heap_size) take precedence") DEFINE_SIZE_T( max_shared_heap_size
OptionalOpIndex index
const std::string property
double second
ZoneVector< RpoNumber > & result
Point from
int x
FunctionLiteral * literal
Definition liveedit.cc:294
int position
Definition liveedit.cc:290
int n
Definition mul-fft.cc:296
int int32_t
Definition unicode.cc:40
uint32_t uc32
Definition strings.h:19
uint16_t uc16
Definition strings.h:18
void ReportError(Args &&... args)
Definition utils.h:96
constexpr std::optional< RegExpFlag > TryRegExpFlagFromChar(char c)
bool IsIdentifierStart(base::uc32 c)
PerThreadAssertScopeDebugOnly< false, SAFEPOINTS_ASSERT, HEAP_ALLOCATION_ASSERT > DisallowGarbageCollection
BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL int character
base::Flags< RegExpFlag > RegExpFlags
uintptr_t GetCurrentStackPosition()
Definition utils.cc:222
ZoneMap< base::Vector< const base::uc32 >, RegExpTree *, CharacterClassStringLess > CharacterClassStrings
Definition regexp-ast.h:392
constexpr bool IsDecimalDigit(base::uc32 c)
V8_EXPORT_PRIVATE FlagValues v8_flags
constexpr int AsciiAlphaToLower(base::uc32 c)
bool IsIdentifierPart(base::uc32 c)
ZoneList< RegExpTree * > * terms_
#define DOUBLE_PUNCTUATOR_CASE(Char)
RegExpParserState *const previous_state_
int error_pos_
RegExpTextBuilder text_builder_
base::uc32 current_
RegExpError error_
int capture_count_
bool force_unicode_
const int input_length_
ZoneList< RegExpBackReference * > * named_back_references_
ZoneMap< RegExpCapture *, ZoneList< int > *, RegExpCaptureNameLess > * named_captures_
bool has_named_captures_
Zone *const zone_
ZoneList< base::uc16 > * characters_
bool failed_
bool contains_anchor_
std::pair< int, int > non_participating_capture_group_interval_
const ZoneVector< base::uc16 > *const capture_name_
static const base::uc16 kNoPendingSurrogate
base::uc16 pending_surrogate_
RegExpParserImpl< CharT > *const parser_
SmallRegExpTreeVector text_
ZoneList< RegExpCapture * > * captures_
RegExpBuilder builder_
bool pending_empty_
bool has_more_
const int disjunction_capture_index_
bool simple_
SmallRegExpTreeVector alternatives_
const SubexpressionType group_type_
SmallRegExpTreeVector * terms_
const RegExpLookaround::Type lookaround_type_
#define CHECK_FAILED
const uintptr_t stack_limit_
int next_pos_
const RegExpFlags flags_
int captures_started_
bool is_scanned_for_captures_
int quantifier_count_
static const base::uc32 kEndMarker
int lookaround_count_
#define UNREACHABLE()
Definition logging.h:67
#define FATAL(...)
Definition logging.h:47
#define DCHECK_NULL(val)
Definition logging.h:491
#define CHECK(condition)
Definition logging.h:124
#define DCHECK_NOT_NULL(val)
Definition logging.h:492
#define DCHECK_IMPLIES(v1, v2)
Definition logging.h:493
#define DCHECK_NE(v1, v2)
Definition logging.h:486
#define DCHECK(condition)
Definition logging.h:482
#define DCHECK_EQ(v1, v2)
Definition logging.h:485
std::unique_ptr< ValueMirror > value