// Extracted from Doxygen page: regexp-compiler-tonode_8cc_source.html

// Copyright 2019 the V8 project authors. All rights reserved.

// Use of this source code is governed by a BSD-style license that can be

// found in the LICENSE file.


#include "src/common/globals.h"

#include "src/execution/isolate.h"

#include "src/objects/string.h"

#include "src/regexp/regexp-compiler.h"

#include "src/regexp/regexp.h"

#include "src/strings/unicode-inl.h"

#include "src/zone/zone-list-inl.h"


#ifdef V8_INTL_SUPPORT

#include "src/base/strings.h"

#include "src/regexp/special-case.h"

#include "unicode/locid.h"

#include "unicode/uniset.h"

#include "unicode/utypes.h"

#endif  // V8_INTL_SUPPORT


namespace v8 {

namespace internal {


using namespace regexp_compiler_constants;  // NOLINT(build/namespaces)


// Largest code point value handled by this file (inclusive upper bound).
constexpr base::uc32 kMaxCodePoint = 0x10ffff;

// Largest value of a single UTF-16 code unit, as a signed int.
constexpr int kMaxUtf16CodeUnit = 0xffff;

// Same value as kMaxUtf16CodeUnit, typed as an unsigned 32-bit integer.
constexpr uint32_t kMaxUtf16CodeUnitU = 0xffff;


// -------------------------------------------------------------------

// Tree to graph conversion


// Wraps this atom in a single-element TextElement list and returns a TextNode
// over that list which continues at |on_success|.
RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
                               RegExpNode* on_success) {
  Zone* const zone = compiler->zone();
  auto* const text_elements = zone->New<ZoneList<TextElement>>(1, zone);
  text_elements->Add(TextElement::Atom(this), zone);
  const bool read_backward = compiler->read_backward();
  return zone->New<TextNode>(text_elements, read_backward, on_success);
}


// Returns a TextNode over this text's existing element list that continues at
// |on_success|.
RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
                               RegExpNode* on_success) {
  Zone* const zone = compiler->zone();
  const bool read_backward = compiler->read_backward();
  return zone->New<TextNode>(elements(), read_backward, on_success);
}


namespace {


// Returns true if |ranges| is exactly the complement (over
// [0, kMaxCodePoint]) of the class described by |special_class|.
//
// |special_class| holds |length| ints: (from, exclusive-end) pairs followed by
// a kRangeEndMarker entry.
bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
                          const int* special_class, int length) {
  const int boundary_count = length - 1;  // Ignore the trailing end marker.

  DCHECK_EQ(kRangeEndMarker, special_class[boundary_count]);
  DCHECK_NE(0, ranges->length());
  DCHECK_NE(0, boundary_count);
  DCHECK_NE(0, special_class[0]);

  // The complement of N class ranges consists of N + 1 ranges.
  const int expected_range_count = (boundary_count >> 1) + 1;
  if (ranges->length() != expected_range_count) return false;

  CharacterRange current = ranges->at(0);
  if (current.from() != 0) return false;

  for (int pair = 0; 2 * pair < boundary_count; pair++) {
    // The current gap must end right before the class range starts ...
    const base::uc32 class_from =
        static_cast<base::uc32>(special_class[2 * pair]);
    if (current.to() + 1 != class_from) return false;

    // ... and the next gap must start right at its exclusive end.
    current = ranges->at(pair + 1);
    const base::uc32 class_end =
        static_cast<base::uc32>(special_class[2 * pair + 1]);
    if (current.from() != class_end) return false;
  }

  return current.to() == kMaxCodePoint;
}


// Returns true if |ranges| matches the class described by |special_class|
// range for range.
//
// |special_class| holds |length| ints: (from, exclusive-end) pairs followed by
// a kRangeEndMarker entry; CharacterRange bounds are inclusive, hence the -1.
bool CompareRanges(ZoneList<CharacterRange>* ranges, const int* special_class,
                   int length) {
  const int boundary_count = length - 1;  // Ignore the trailing end marker.

  DCHECK_EQ(kRangeEndMarker, special_class[boundary_count]);
  if (ranges->length() * 2 != boundary_count) return false;

  for (int index = 0; 2 * index < boundary_count; index++) {
    CharacterRange candidate = ranges->at(index);
    if (candidate.from() != static_cast<base::uc32>(special_class[2 * index])) {
      return false;
    }
    if (candidate.to() !=
        static_cast<base::uc32>(special_class[2 * index + 1] - 1)) {
      return false;
    }
  }
  return true;
}


}  // namespace


// Returns true if this non-negated class is one of the predefined sets
// (whitespace, line terminator, word, or one of their complements). When a
// match is found by comparing ranges, the detected type is recorded on |set_|.
bool RegExpClassRanges::is_standard(Zone* zone) {
  // TODO(lrn): Remove need for this function, by not throwing away information
  // along the way.
  if (is_negated()) return false;
  if (set_.is_standard()) return true;

  struct KnownSet {
    bool complemented;       // Compare against the complement of the table.
    const int* boundaries;   // Range table terminated by kRangeEndMarker.
    int boundary_count;      // Table length, including the end marker.
    StandardCharacterSet type;
  };
  // Probed in this order; the first match wins.
  const KnownSet kKnownSets[] = {
      {false, kSpaceRanges, kSpaceRangeCount,
       StandardCharacterSet::kWhitespace},
      {true, kSpaceRanges, kSpaceRangeCount,
       StandardCharacterSet::kNotWhitespace},
      {true, kLineTerminatorRanges, kLineTerminatorRangeCount,
       StandardCharacterSet::kNotLineTerminator},
      {false, kLineTerminatorRanges, kLineTerminatorRangeCount,
       StandardCharacterSet::kLineTerminator},
      {false, kWordRanges, kWordRangeCount, StandardCharacterSet::kWord},
      {true, kWordRanges, kWordRangeCount, StandardCharacterSet::kNotWord},
  };

  for (const KnownSet& known : kKnownSets) {
    ZoneList<CharacterRange>* current = set_.ranges(zone);
    const bool matches =
        known.complemented
            ? CompareInverseRanges(current, known.boundaries,
                                   known.boundary_count)
            : CompareRanges(current, known.boundaries, known.boundary_count);
    if (!matches) continue;
    set_.set_standard_set_type(known.type);
    return true;
  }
  return false;
}


// Distributes every range in |base| over the splitter's categories.
UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList<CharacterRange>* base) {
  // The unicode range splitter categorizes given character ranges into:
  // - Code points from the BMP representable by one code unit.
  // - Code points outside the BMP that need to be split into
  //   surrogate pairs.
  // - Lone lead surrogates.
  // - Lone trail surrogates.
  // Lone surrogates are valid code points, even though no actual characters.
  // They require special matching to make sure we do not split surrogate pairs.
  const int range_count = base->length();
  int index = 0;
  while (index < range_count) {
    AddRange(base->at(index));
    index++;
  }
}


// Clips |range| against five consecutive code point segments (BMP below the
// surrogates, lead surrogates, trail surrogates, BMP above the surrogates,
// non-BMP) and appends each non-empty piece to the matching member vector.
void UnicodeRangeSplitter::AddRange(CharacterRange range) {
  // All segment bounds are inclusive.
  static constexpr base::uc32 kLowBmpFirst = 0;
  static constexpr base::uc32 kLowBmpLast = kLeadSurrogateStart - 1;
  static constexpr base::uc32 kHighBmpFirst = kTrailSurrogateEnd + 1;
  static constexpr base::uc32 kHighBmpLast = kNonBmpStart - 1;

  // The segments are non-empty, adjacent and in ascending order.
  static_assert(kLowBmpFirst == 0);
  static_assert(kLowBmpFirst < kLowBmpLast);
  static_assert(kLowBmpLast + 1 == kLeadSurrogateStart);
  static_assert(kLeadSurrogateStart < kLeadSurrogateEnd);
  static_assert(kLeadSurrogateEnd + 1 == kTrailSurrogateStart);
  static_assert(kTrailSurrogateStart < kTrailSurrogateEnd);
  static_assert(kTrailSurrogateEnd + 1 == kHighBmpFirst);
  static_assert(kHighBmpFirst < kHighBmpLast);
  static_assert(kHighBmpLast + 1 == kNonBmpStart);
  static_assert(kNonBmpStart < kNonBmpEnd);

  static constexpr base::uc32 kSegmentFirst[] = {
      kLowBmpFirst,  kLeadSurrogateStart, kTrailSurrogateStart,
      kHighBmpFirst, kNonBmpStart,
  };
  static constexpr base::uc32 kSegmentLast[] = {
      kLowBmpLast, kLeadSurrogateEnd, kTrailSurrogateEnd,
      kHighBmpLast, kNonBmpEnd,
  };
  // Both BMP segments feed the same vector.
  CharacterRangeVector* const kSegmentSink[] = {
      &bmp_, &lead_surrogates_, &trail_surrogates_, &bmp_, &non_bmp_,
  };

  static constexpr int kSegmentCount = arraysize(kSegmentFirst);
  static_assert(kSegmentCount == arraysize(kSegmentLast));
  static_assert(kSegmentCount == arraysize(kSegmentSink));

  const base::uc32 range_first = range.from();
  const base::uc32 range_last = range.to();
  for (int segment = 0; segment < kSegmentCount; segment++) {
    // Segments ascend, so once one starts past the range we are done.
    if (kSegmentFirst[segment] > range_last) return;
    const base::uc32 first = std::max(kSegmentFirst[segment], range_first);
    const base::uc32 last = std::min(kSegmentLast[segment], range_last);
    if (first <= last) {
      kSegmentSink[segment]->emplace_back(CharacterRange::Range(first, last));
    }
  }
}


namespace {


// Translates between new and old V8-isms (SmallVector, ZoneList).
//
// Copies |v| into a freshly zone-allocated ZoneList and canonicalizes it.
// Returns nullptr (and allocates nothing) when |v| is empty.
ZoneList<CharacterRange>* ToCanonicalZoneList(
    const UnicodeRangeSplitter::CharacterRangeVector* v, Zone* zone) {
  const size_t count = v->size();
  if (count == 0) return nullptr;

  auto* const list =
      zone->New<ZoneList<CharacterRange>>(static_cast<int>(count), zone);
  for (size_t index = 0; index != count; ++index) {
    list->Add(v->at(index), zone);
  }

  CharacterRange::Canonicalize(list);
  return list;
}


// Adds one alternative to |result| that matches the splitter's BMP ranges.
// Does nothing if the splitter collected no BMP ranges.
void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
                      RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
  Zone* const zone = compiler->zone();
  ZoneList<CharacterRange>* const bmp_ranges =
      ToCanonicalZoneList(splitter->bmp(), zone);
  if (bmp_ranges != nullptr) {
    RegExpNode* const text = TextNode::CreateForCharacterRanges(
        zone, bmp_ranges, compiler->read_backward(), on_success);
    result->AddAlternative(GuardedAlternative(text));
  }
}


// A {from, to} pair of 16-bit code units packed into one uint32_t:
// |from| occupies the upper 16 bits, |to| the lower 16 bits.
using UC16Range = uint32_t;

// Packs |from| and |to| into a UC16Range.
constexpr UC16Range ToUC16Range(base::uc16 from, base::uc16 to) {
  const uint32_t upper_half = static_cast<uint32_t>(from) << 16;
  const uint32_t lower_half = to;
  return upper_half | lower_half;
}

// Returns the |from| component (upper 16 bits) of |r|.
constexpr base::uc16 ExtractFrom(UC16Range r) {
  return static_cast<base::uc16>((r >> 16) & 0xffff);
}

// Returns the |to| component (lower 16 bits) of |r|.
constexpr base::uc16 ExtractTo(UC16Range r) {
  return static_cast<base::uc16>(r & 0xffff);
}


// Adds alternatives to |result| that match the splitter's non-BMP ranges as
// lead/trail surrogate pairs. Does nothing if the splitter collected no
// non-BMP ranges. Must not be called for one-byte subjects (DCHECKed).
void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
                             RegExpNode* on_success,
                             UnicodeRangeSplitter* splitter) {
  DCHECK(!compiler->one_byte());
  Zone* const zone = compiler->zone();
  ZoneList<CharacterRange>* non_bmp =
      ToCanonicalZoneList(splitter->non_bmp(), zone);
  if (non_bmp == nullptr) return;

  // Translate each 32-bit code point range into the corresponding 16-bit code
  // unit representation consisting of the lead- and trail surrogate.
  //
  // The generated alternatives are grouped by the leading surrogate to avoid
  // emitting excessive code. For example, for
  //
  //  { \ud800[\udc00-\udc01]
  //  , \ud800[\udc05-\udc06]
  //  }
  //
  // there's no need to emit matching code for the leading surrogate \ud800
  // twice. We also create a dedicated grouping for full trailing ranges, i.e.
  // [dc00-dfff].
  ZoneUnorderedMap<UC16Range, ZoneList<CharacterRange>*> grouped_by_leading(
      zone);
  ZoneList<CharacterRange>* leading_with_full_trailing_range =
      zone->New<ZoneList<CharacterRange>>(1, zone);
  const auto AddRange = [&](base::uc16 from_l, base::uc16 to_l,
                            base::uc16 from_t, base::uc16 to_t) {
    const UC16Range leading_range = ToUC16Range(from_l, to_l);
    // Single lookup; the iterator is reused for the append below.
    auto group = grouped_by_leading.find(leading_range);
    if (group == grouped_by_leading.end()) {
      if (from_t == kTrailSurrogateStart && to_t == kTrailSurrogateEnd) {
        // Full trailing range for a leading range without a group yet: record
        // it in the dedicated list instead of creating a group.
        leading_with_full_trailing_range->Add(
            CharacterRange::Range(from_l, to_l), zone);
        return;
      }
      group = grouped_by_leading
                  .emplace(leading_range,
                           zone->New<ZoneList<CharacterRange>>(2, zone))
                  .first;
    }
    group->second->Add(CharacterRange::Range(from_t, to_t), zone);
  };

  // First, create the grouped ranges.
  // NOTE(review): ToCanonicalZoneList above already canonicalizes |non_bmp|;
  // this second call looks redundant but is kept as in the original.
  CharacterRange::Canonicalize(non_bmp);
  for (int i = 0; i < non_bmp->length(); i++) {
    // Match surrogate pair.
    // E.g. [\u10005-\u11005] becomes
    //      \ud800[\udc05-\udfff]|
    //      [\ud801-\ud803][\udc00-\udfff]|
    //      \ud804[\udc00-\udc05]
    base::uc32 from = non_bmp->at(i).from();
    base::uc32 to = non_bmp->at(i).to();
    base::uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
    base::uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
    base::uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
    base::uc16 to_t = unibrow::Utf16::TrailSurrogate(to);

    if (from_l == to_l) {
      // The lead surrogate is the same.
      AddRange(from_l, to_l, from_t, to_t);
      continue;
    }

    if (from_t != kTrailSurrogateStart) {
      // Add [from_l][from_t-\udfff].
      AddRange(from_l, from_l, from_t, kTrailSurrogateEnd);
      from_l++;
    }
    if (to_t != kTrailSurrogateEnd) {
      // Add [to_l][\udc00-to_t].
      AddRange(to_l, to_l, kTrailSurrogateStart, to_t);
      to_l--;
    }
    if (from_l <= to_l) {
      // Add [from_l-to_l][\udc00-\udfff].
      AddRange(from_l, to_l, kTrailSurrogateStart, kTrailSurrogateEnd);
    }
  }

  // Create the actual TextNode now that ranges are fully grouped.
  if (!leading_with_full_trailing_range->is_empty()) {
    CharacterRange::Canonicalize(leading_with_full_trailing_range);
    result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
        zone, leading_with_full_trailing_range,
        CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
        compiler->read_backward(), on_success)));
  }
  for (const auto& entry : grouped_by_leading) {
    // The map key is the packed leading range; the value holds the trailing
    // ranges collected for it.
    CharacterRange leading_range =
        CharacterRange::Range(ExtractFrom(entry.first), ExtractTo(entry.first));
    ZoneList<CharacterRange>* trailing_ranges = entry.second;
    CharacterRange::Canonicalize(trailing_ranges);
    result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
        zone, leading_range, trailing_ranges, compiler->read_backward(),
        on_success)));
  }
}


// Builds a lookaround over |lookbehind| that reads opposite to
// |read_backward| and, on lookaround success, continues with a text node
// matching |match| in read direction and then |on_success|.
RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
    RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
    ZoneList<CharacterRange>* match, RegExpNode* on_success,
    bool read_backward) {
  Zone* const zone = compiler->zone();
  // The node that performs the actual match; it runs after the lookaround.
  RegExpNode* const continuation = TextNode::CreateForCharacterRanges(
      zone, match, read_backward, on_success);
  const int stack_reg = compiler->UnicodeLookaroundStackRegister();
  const int position_reg = compiler->UnicodeLookaroundPositionRegister();
  // First argument false: presumably selects a negative lookaround, in line
  // with this helper's name.
  RegExpLookaround::Builder builder(false, continuation, stack_reg,
                                    position_reg);
  const bool against_read_direction = !read_backward;
  RegExpNode* const forbidden = TextNode::CreateForCharacterRanges(
      zone, lookbehind, against_read_direction, builder.on_match_success());
  return builder.ForMatch(forbidden);
}


// Builds a text node matching |match| in read direction, followed by a
// lookaround over |lookahead| in the same direction, followed by |on_success|.
RegExpNode* MatchAndNegativeLookaroundInReadDirection(
    RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
    ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
    bool read_backward) {
  Zone* const zone = compiler->zone();
  const int stack_reg = compiler->UnicodeLookaroundStackRegister();
  const int position_reg = compiler->UnicodeLookaroundPositionRegister();
  // First argument false: presumably selects a negative lookaround, in line
  // with this helper's name.
  RegExpLookaround::Builder builder(false, on_success, stack_reg,
                                    position_reg);
  RegExpNode* const forbidden = TextNode::CreateForCharacterRanges(
      zone, lookahead, read_backward, builder.on_match_success());
  RegExpNode* const after_match = builder.ForMatch(forbidden);
  return TextNode::CreateForCharacterRanges(zone, match, read_backward,
                                            after_match);
}


// Adds one alternative to |result| matching the splitter's lead surrogate
// ranges only when they are not followed by a trail surrogate.
// E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
// Does nothing if the splitter collected no lead surrogates.
void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
                           RegExpNode* on_success,
                           UnicodeRangeSplitter* splitter) {
  Zone* const zone = compiler->zone();
  ZoneList<CharacterRange>* const lone_leads =
      ToCanonicalZoneList(splitter->lead_surrogates(), zone);
  if (lone_leads == nullptr) return;

  ZoneList<CharacterRange>* const any_trail = CharacterRange::List(
      zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));

  // Backward: assert that reading forward there is no trail surrogate, then
  // backward match the lead surrogate.
  // Forward: match the lead surrogate and assert that no trail surrogate
  // follows.
  RegExpNode* const guarded =
      compiler->read_backward()
          ? NegativeLookaroundAgainstReadDirectionAndMatch(
                compiler, any_trail, lone_leads, on_success, true)
          : MatchAndNegativeLookaroundInReadDirection(
                compiler, lone_leads, any_trail, on_success, false);
  result->AddAlternative(GuardedAlternative(guarded));
}


// Adds one alternative to |result| matching the splitter's trail surrogate
// ranges only when they are not preceded by a lead surrogate.
// E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
// Does nothing if the splitter collected no trail surrogates.
void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
                            RegExpNode* on_success,
                            UnicodeRangeSplitter* splitter) {
  Zone* const zone = compiler->zone();
  ZoneList<CharacterRange>* const lone_trails =
      ToCanonicalZoneList(splitter->trail_surrogates(), zone);
  if (lone_trails == nullptr) return;

  ZoneList<CharacterRange>* const any_lead = CharacterRange::List(
      zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));

  // Backward: match the trail surrogate and assert that no lead surrogate
  // precedes it.
  // Forward: assert that reading backward there is no lead surrogate, then
  // forward match the trail surrogate.
  RegExpNode* const guarded =
      compiler->read_backward()
          ? MatchAndNegativeLookaroundInReadDirection(
                compiler, lone_trails, any_lead, on_success, true)
          : NegativeLookaroundAgainstReadDirectionAndMatch(
                compiler, any_lead, lone_trails, on_success, false);
  result->AddAlternative(GuardedAlternative(guarded));
}


// Returns a forward-reading text node that consumes any single code unit in
// [0, kMaxUtf16CodeUnit] and then continues at |on_success|.
// Must only be used when reading forward (DCHECKed).
RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
                              RegExpNode* on_success) {
  // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
  DCHECK(!compiler->read_backward());
  Zone* const zone = compiler->zone();
  // Advance any character. If the character happens to be a lead surrogate and
  // we advanced into the middle of a surrogate pair, it will work out, as
  // nothing will match from there. We will have to advance again, consuming
  // the associated trail surrogate.
  const CharacterRange any_code_unit =
      CharacterRange::Range(0, kMaxUtf16CodeUnit);
  ZoneList<CharacterRange>* const any_code_unit_list =
      CharacterRange::List(zone, any_code_unit);
  return TextNode::CreateForCharacterRanges(zone, any_code_unit_list, false,
                                            on_success);
}


}  // namespace


// static
// Only for /ui and /vi, not for /i regexps.
//
// Replaces the contents of |ranges| (which must be canonical) with its simple
// case-insensitive closure as computed by ICU. Without V8_INTL_SUPPORT this is
// a no-op.
void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
                                               Zone* zone) {
#ifdef V8_INTL_SUPPORT
  DCHECK(IsCanonical(ranges));

  // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
  // See also https://crbug.com/v8/6727.
  // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
  // which we use frequently internally. But large ranges can also easily be
  // created by the user. We might want to have a more general caching mechanism
  // for such ranges.
  const bool covers_everything =
      ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd);
  if (covers_everything) return;

  // Use ICU to compute the case fold closure over the ranges.
  icu::UnicodeSet closure;
  const int input_count = ranges->length();
  for (int index = 0; index < input_count; index++) {
    CharacterRange input = ranges->at(index);
    closure.add(input.from(), input.to());
  }

  // Clear the ranges list without freeing the backing store.
  ranges->Rewind(0);
  closure.closeOver(USET_SIMPLE_CASE_INSENSITIVE);

  const int closed_count = closure.getRangeCount();
  for (int index = 0; index < closed_count; index++) {
    ranges->Add(Range(closure.getRangeStart(index), closure.getRangeEnd(index)),
                zone);
  }
  // No errors and everything we collected have been ranges.
  Canonicalize(ranges);
#endif  // V8_INTL_SUPPORT
}


// Converts this character class into graph nodes continuing at |on_success|.
//
// Without a unicode flag, for one-byte subjects, or when the class contains a
// split surrogate, a single TextNode over this class is returned. Otherwise
// the (possibly negated) ranges are split by code point category and a
// ChoiceNode with one alternative per non-empty category is returned.
RegExpNode* RegExpClassRanges::ToNode(RegExpCompiler* compiler,
                                      RegExpNode* on_success) {
  set_.Canonicalize();
  Zone* const zone = compiler->zone();
  ZoneList<CharacterRange>* effective_ranges = this->ranges(zone);

  // Add case equivalents unless the ranges were already case folded.
  if (NeedsUnicodeCaseEquivalents(compiler->flags()) && !is_case_folded()) {
    CharacterRange::AddUnicodeCaseEquivalents(effective_ranges, zone);
  }

  const bool plain_text_node_suffices = !IsEitherUnicode(compiler->flags()) ||
                                        compiler->one_byte() ||
                                        contains_split_surrogate();
  if (plain_text_node_suffices) {
    return zone->New<TextNode>(this, compiler->read_backward(), on_success);
  }

  if (is_negated()) {
    // With /v, character classes are never negated.
    // https://tc39.es/ecma262/#sec-compileatom
    // Atom :: CharacterClass
    //   4. Assert: cc.[[Invert]] is false.
    // Instead the complement is created when evaluating the class set.
    // The only exception is the "nothing range" (negated everything), which is
    // internally created for an empty set.
    DCHECK_IMPLIES(IsUnicodeSets(compiler->flags()),
                   effective_ranges->length() == 1 &&
                       effective_ranges->first().IsEverything(kMaxCodePoint));
    ZoneList<CharacterRange>* complement =
        zone->New<ZoneList<CharacterRange>>(2, zone);
    CharacterRange::Negate(effective_ranges, complement, zone);
    effective_ranges = complement;
  }

  if (effective_ranges->length() == 0) {
    // The empty character class is used as a 'fail' node.
    RegExpClassRanges* fail =
        zone->New<RegExpClassRanges>(zone, effective_ranges);
    return zone->New<TextNode>(fail, compiler->read_backward(), on_success);
  }

  const bool matches_everything =
      set_.is_standard() &&
      standard_type() == StandardCharacterSet::kEverything;
  if (matches_everything) return UnanchoredAdvance(compiler, on_success);

  // Split ranges in order to handle surrogates correctly:
  // - Surrogate pairs: translate the 32-bit code point into two uc16 code
  //   units (irregexp operates only on code units).
  // - Lone surrogates: these require lookarounds to ensure we don't match in
  //   the middle of a surrogate pair.
  ChoiceNode* const choice = zone->New<ChoiceNode>(2, zone);
  UnicodeRangeSplitter splitter(effective_ranges);
  AddBmpCharacters(compiler, choice, on_success, &splitter);
  AddNonBmpSurrogatePairs(compiler, choice, on_success, &splitter);
  AddLoneLeadSurrogates(compiler, choice, on_success, &splitter);
  AddLoneTrailSurrogates(compiler, choice, on_success, &splitter);

  static constexpr int kMaxRangesToInline = 32;  // Arbitrary.
  if (effective_ranges->length() > kMaxRangesToInline) {
    choice->SetDoNotInline();
  }

  return choice;
}


// Converts this class set operand into graph nodes continuing at |on_success|.
//
// The operand is lowered to a disjunction of its non-empty strings, then its
// character ranges (if any), then the empty string (if present). With exactly
// one alternative, that alternative is lowered directly; with none, an empty
// character class is lowered instead.
RegExpNode* RegExpClassSetOperand::ToNode(RegExpCompiler* compiler,
                                          RegExpNode* on_success) {
  Zone* zone = compiler->zone();
  const int size = (has_strings() ? static_cast<int>(strings()->size()) : 0) +
                   (ranges()->is_empty() ? 0 : 1);
  if (size == 0) {
    // If neither ranges nor strings are present, the operand is equal to an
    // empty range (matching nothing).
    ZoneList<CharacterRange>* empty =
        zone->New<ZoneList<CharacterRange>>(0, zone);
    return zone->New<RegExpClassRanges>(zone, empty)
        ->ToNode(compiler, on_success);
  }
  ZoneList<RegExpTree*>* alternatives =
      zone->New<ZoneList<RegExpTree*>>(size, zone);
  // Strings are sorted by length first (larger strings before shorter ones).
  // See the comment on CharacterClassStrings.
  // Empty strings (if present) are added after character ranges.
  RegExpTree* empty_string = nullptr;
  if (has_strings()) {
    // Iterate by const reference to avoid copying each map entry.
    for (const auto& entry : *strings()) {
      if (entry.second->IsEmpty()) {
        empty_string = entry.second;
      } else {
        alternatives->Add(entry.second, zone);
      }
    }
  }
  if (!ranges()->is_empty()) {
    // In unicode sets mode case folding has to be done at precise locations
    // (e.g. before building complements).
    // It is therefore the parser's responsibility to case fold (sub-) ranges
    // before creating ClassSetOperands.
    alternatives->Add(
        zone->New<RegExpClassRanges>(zone, ranges(),
                                     RegExpClassRanges::IS_CASE_FOLDED),
        zone);
  }
  if (empty_string != nullptr) {
    alternatives->Add(empty_string, zone);
  }

  RegExpTree* node = nullptr;
  if (size == 1) {
    DCHECK_EQ(alternatives->length(), 1);
    node = alternatives->first();
  } else {
    node = zone->New<RegExpDisjunction>(alternatives);
  }
  return node->ToNode(compiler, on_success);
}


// Evaluates this class set expression down to a single operand and converts
// that operand into graph nodes continuing at |on_success|.
RegExpNode* RegExpClassSetExpression::ToNode(RegExpCompiler* compiler,
                                             RegExpNode* on_success) {
  Zone* const zone = compiler->zone();
  // Scratch list handed to ComputeExpression.
  auto* const scratch_ranges = zone->New<ZoneList<CharacterRange>>(4, zone);
  RegExpClassSetOperand* const evaluated =
      ComputeExpression(this, scratch_ranges, zone);
  return evaluated->ToNode(compiler, on_success);
}


// Adds |other|'s ranges and strings to this operand. The ranges are only
// appended here, not canonicalized.
void RegExpClassSetOperand::Union(RegExpClassSetOperand* other, Zone* zone) {
  ranges()->AddAll(*other->ranges(), zone);
  if (!other->has_strings()) return;

  // Lazily create our own string set before merging into it.
  if (strings_ == nullptr) {
    strings_ = zone->New<CharacterClassStrings>(zone);
  }
  auto* const incoming = other->strings();
  strings()->insert(incoming->begin(), incoming->end());
}


// Reduces this operand to what it has in common with |other|: the ranges are
// intersected via |temp_ranges| (left empty afterwards) and only strings whose
// key is also present in |other| are kept.
void RegExpClassSetOperand::Intersect(RegExpClassSetOperand* other,
                                      ZoneList<CharacterRange>* temp_ranges,
                                      Zone* zone) {
  // Compute into the scratch list, then swap the result into place.
  CharacterRange::Intersect(ranges(), other->ranges(), temp_ranges, zone);
  std::swap(*ranges(), *temp_ranges);
  temp_ranges->Rewind(0);

  if (!has_strings()) return;
  if (!other->has_strings()) {
    strings()->clear();
    return;
  }

  auto cursor = strings()->begin();
  while (cursor != strings()->end()) {
    const bool in_other =
        other->strings()->find(cursor->first) != other->strings()->end();
    if (in_other) {
      cursor++;
    } else {
      cursor = strings()->erase(cursor);
    }
  }
}


// Removes |other| from this operand: the ranges are subtracted via
// |temp_ranges| (left empty afterwards) and every string whose key is also
// present in |other| is erased.
void RegExpClassSetOperand::Subtract(RegExpClassSetOperand* other,
                                     ZoneList<CharacterRange>* temp_ranges,
                                     Zone* zone) {
  // Compute into the scratch list, then swap the result into place.
  CharacterRange::Subtract(ranges(), other->ranges(), temp_ranges, zone);
  std::swap(*ranges(), *temp_ranges);
  temp_ranges->Rewind(0);

  // Nothing to remove unless both sides have strings.
  if (!has_strings() || !other->has_strings()) return;

  auto cursor = strings()->begin();
  while (cursor != strings()->end()) {
    const bool in_other =
        other->strings()->find(cursor->first) != other->strings()->end();
    if (in_other) {
      cursor = strings()->erase(cursor);
    } else {
      cursor++;
    }
  }
}


// static


// Recursively evaluates the class set tree rooted at |root| and returns the
// resulting operand.
//
// A ClassSetOperand is returned as is. For a ClassSetExpression, operand 0 is
// evaluated first and every further operand is then folded into that result
// in place using the node's operation; the tree is mutated along the way.
// |temp_ranges| is scratch space that must be empty on entry.
RegExpClassSetOperand* RegExpClassSetExpression::ComputeExpression(

    RegExpTree* root, ZoneList<CharacterRange>* temp_ranges, Zone* zone) {

  DCHECK(temp_ranges->is_empty());

  if (root->IsClassSetOperand()) {

    return root->AsClassSetOperand();

  }

  DCHECK(root->IsClassSetExpression());

  RegExpClassSetExpression* node = root->AsClassSetExpression();

  // The evaluated first operand doubles as the accumulator.
  RegExpClassSetOperand* result =

      ComputeExpression(node->operands()->at(0), temp_ranges, zone);

  switch (node->operation()) {

    case OperationType::kUnion: {

      for (int i = 1; i < node->operands()->length(); i++) {

        RegExpClassSetOperand* op =

            ComputeExpression(node->operands()->at(i), temp_ranges, zone);

        result->Union(op, zone);

      }

      // Canonicalize once, after all operands have been merged.
      CharacterRange::Canonicalize(result->ranges());

      break;

    }

    case OperationType::kIntersection: {

      for (int i = 1; i < node->operands()->length(); i++) {

        RegExpClassSetOperand* op =

            ComputeExpression(node->operands()->at(i), temp_ranges, zone);

        result->Intersect(op, temp_ranges, zone);

      }

      break;

    }

    case OperationType::kSubtraction: {

      for (int i = 1; i < node->operands()->length(); i++) {

        RegExpClassSetOperand* op =

            ComputeExpression(node->operands()->at(i), temp_ranges, zone);

        result->Subtract(op, temp_ranges, zone);

      }

      break;

    }

  }

  if (node->is_negated()) {

    DCHECK(!result->has_strings());

    // Negate into the scratch list, swap it into place and leave the scratch
    // list empty again.
    CharacterRange::Negate(result->ranges(), temp_ranges, zone);

    std::swap(*result->ranges(), *temp_ranges);

    temp_ranges->Rewind(0);

    // The negation is now part of |result|; clear the flag on the node.
    node->is_negated_ = false;

  }

  // Store the result as single operand of the current node.

  node->operands()->Set(0, result);

  // NOTE(review): Rewind(1) presumably truncates the operand list to its first
  // entry - confirm against ZoneList.
  node->operands()->Rewind(1);


  return result;

}


namespace {


// Three-way compares the code unit at |index_a| of |a| with the one at
// |index_b| of |b|. Returns -1, 0 or 1.
int CompareCharAt(RegExpAtom* a, int index_a, RegExpAtom* b, int index_b) {
  const base::uc16 lhs = a->data().at(index_a);
  const base::uc16 rhs = b->data().at(index_b);
  if (lhs == rhs) return 0;
  return lhs < rhs ? -1 : 1;
}


// Sort comparator: orders two atoms by their first code unit.
int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
  return CompareCharAt((*a)->AsAtom(), 0, (*b)->AsAtom(), 0);
}


// Sort comparator: orders two atoms by their last code unit.
int CompareLastChar(RegExpTree* const* a, RegExpTree* const* b) {
  RegExpAtom* const lhs = (*a)->AsAtom();
  RegExpAtom* const rhs = (*b)->AsAtom();
  const int lhs_last = lhs->length() - 1;
  const int rhs_last = rhs->length() - 1;
  return CompareCharAt(lhs, lhs_last, rhs, rhs_last);
}


#ifdef V8_INTL_SUPPORT


// Compares |a| and |b| with ICU's caseCompare using U_FOLD_CASE_DEFAULT and
// returns its result.
int CompareCaseInsensitive(const icu::UnicodeString& a,
                           const icu::UnicodeString& b) {
  const int order = a.caseCompare(b, U_FOLD_CASE_DEFAULT);
  return order;
}


// Case-insensitively compares the code unit at |index_a| of |a| with the one
// at |index_b| of |b|, each wrapped in a one-character ICU string.
int CompareCharAtCaseInsensitive(RegExpAtom* a, int index_a, RegExpAtom* b,
                                 int index_b) {
  base::uc16 lhs = a->data().at(index_a);
  base::uc16 rhs = b->data().at(index_b);
  const icu::UnicodeString lhs_string{lhs};
  const icu::UnicodeString rhs_string{rhs};
  return CompareCaseInsensitive(lhs_string, rhs_string);
}


// Sort comparator: case-insensitively orders two atoms by their first code
// unit.
int CompareFirstCharCaseInsensitive(RegExpTree* const* a,
                                    RegExpTree* const* b) {
  return CompareCharAtCaseInsensitive((*a)->AsAtom(), 0, (*b)->AsAtom(), 0);
}


// Sort comparator: case-insensitively orders two atoms by their last code
// unit.
int CompareLastCharCaseInsensitive(RegExpTree* const* a, RegExpTree* const* b) {
  RegExpAtom* const lhs = (*a)->AsAtom();
  RegExpAtom* const rhs = (*b)->AsAtom();
  const int lhs_last = lhs->length() - 1;
  const int rhs_last = rhs->length() - 1;
  return CompareCharAtCaseInsensitive(lhs, lhs_last, rhs, rhs_last);
}


// Returns true if |a| and |b| are identical, or - when |ignore_case| is set -
// if they compare equal case-insensitively.
bool Equals(bool ignore_case, const icu::UnicodeString& a,
            const icu::UnicodeString& b) {
  if (a == b) return true;
  // Exact equality already failed; only case folding can still match.
  return ignore_case && CompareCaseInsensitive(a, b) == 0;
}


// Returns whether the code unit at |index_a| of |a| equals the one at
// |index_b| of |b|, optionally ignoring case.
bool CharAtEquals(bool ignore_case, const RegExpAtom* a, int index_a,
                  const RegExpAtom* b, int index_b) {
  const auto lhs = a->data().at(index_a);
  const auto rhs = b->data().at(index_b);
  return Equals(ignore_case, lhs, rhs);
}


#else


// Returns the single character |canonicalize| maps |c| to, or |c| itself when
// the mapping yields no character.
unibrow::uchar Canonical(
    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    unibrow::uchar c) {
  unibrow::uchar mapped[unibrow::Ecma262Canonicalize::kMaxWidth];
  const int mapped_count = canonicalize->get(c, '\0', mapped);
  DCHECK_LE(mapped_count, 1);
  return mapped_count == 1 ? mapped[0] : c;
}


// Compares |a| and |b| after canonicalization and returns the signed
// difference of the canonical values (0 when equal).
int CompareCaseInsensitive(
    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    unibrow::uchar a, unibrow::uchar b) {
  if (a == b) return 0;
  // Canonicalization is skipped when both characters are below 'a'.
  const bool both_below_lowercase_a = a < 'a' && b < 'a';
  if (!both_below_lowercase_a) {
    a = Canonical(canonicalize, a);
    b = Canonical(canonicalize, b);
  }
  const int lhs = static_cast<int>(a);
  const int rhs = static_cast<int>(b);
  return lhs - rhs;
}


// Case-insensitively compares the code unit at |index_a| of |a| with the one
// at |index_b| of |b|.
int CompareCharAtCaseInsensitive(
    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, RegExpAtom* a,
    int index_a, RegExpAtom* b, int index_b) {
  const base::uc16 lhs = a->data().at(index_a);
  const base::uc16 rhs = b->data().at(index_b);
  return CompareCaseInsensitive(canonicalize, lhs, rhs);
}

// Sort comparator: case-insensitively orders two atoms by their first code
// unit.
int CompareFirstCharCaseInsensitive(
    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    RegExpTree* const* a, RegExpTree* const* b) {
  return CompareCharAtCaseInsensitive(canonicalize, (*a)->AsAtom(), 0,
                                      (*b)->AsAtom(), 0);
}


// Sort comparator: case-insensitively orders two atoms by their last code
// unit.
int CompareLastCharCaseInsensitive(
    unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    RegExpTree* const* a, RegExpTree* const* b) {
  RegExpAtom* const lhs = (*a)->AsAtom();
  RegExpAtom* const rhs = (*b)->AsAtom();
  const int lhs_last = lhs->length() - 1;
  const int rhs_last = rhs->length() - 1;
  return CompareCharAtCaseInsensitive(canonicalize, lhs, lhs_last, rhs,
                                      rhs_last);
}


// Returns true if |a| and |b| are identical, or - when |ignore_case| is set -
// if they compare equal case-insensitively.
bool Equals(bool ignore_case,
            unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
            unibrow::uchar a, unibrow::uchar b) {
  if (a == b) return true;
  // Exact equality already failed; only case folding can still match.
  return ignore_case && CompareCaseInsensitive(canonicalize, a, b) == 0;
}


// Returns whether the code unit at |index_a| of |a| equals the one at
// |index_b| of |b|, optionally ignoring case.
bool CharAtEquals(bool ignore_case,
                  unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
                  const RegExpAtom* a, int index_a, const RegExpAtom* b,
                  int index_b) {
  const auto lhs = a->data().at(index_a);
  const auto rhs = b->data().at(index_b);
  return Equals(ignore_case, canonicalize, lhs, rhs);
}


#endif  // V8_INTL_SUPPORT


}  // namespace


// We can stable sort runs of atoms, since the order does not matter if they

// start with different characters when reading forwards, or end with different

// characters when reading backwards.

// Returns true if any consecutive atoms were found.


bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
  ZoneList<RegExpTree*>* alts = this->alternatives();
  const int alt_count = alts->length();
  bool saw_adjacent_atoms = false;
  for (int pos = 0; pos < alt_count; pos++) {
    // Advance to the next atom; stop once the list is exhausted.
    while (pos < alt_count && !alts->at(pos)->IsAtom()) pos++;
    if (pos == alt_count) break;

    // Grow [run_begin, pos) into the maximal run of atoms starting here.
    const int run_begin = pos;
    pos++;
    while (pos < alt_count && alts->at(pos)->IsAtom()) pos++;
    const int run_size = pos - run_begin;

    DCHECK_LT(run_begin, alts->length());
    DCHECK_LE(pos, alts->length());
    DCHECK_LE(run_begin, pos);

    // Stable-sort the run on a single code unit (the first one, or the last
    // one when reading backwards) so that atoms sharing an affix become
    // adjacent. With ignore-case a plain comparison would reorder /is|I/ to
    // /I|is/, which matters because those alternatives can match at the same
    // position; the case-insensitive comparators are used there instead.
    const bool backwards = compiler->read_backward();
    if (!IsIgnoreCase(compiler->flags())) {
      auto compare = backwards ? CompareLastChar : CompareFirstChar;
      alts->StableSort(compare, run_begin, run_size);
    } else {
#ifdef V8_INTL_SUPPORT
      auto compare = backwards ? CompareLastCharCaseInsensitive
                               : CompareFirstCharCaseInsensitive;
      alts->StableSort(compare, run_begin, run_size);
#else
      unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
          compiler->isolate()->regexp_macro_assembler_canonicalize();
      auto compare = [canonicalize, backwards](RegExpTree* const* a,
                                               RegExpTree* const* b) {
        return backwards ? CompareLastCharCaseInsensitive(canonicalize, a, b)
                         : CompareFirstCharCaseInsensitive(canonicalize, a, b);
      };
      alts->StableSort(compare, run_begin, run_size);
#endif  // V8_INTL_SUPPORT
    }
    if (run_size > 1) saw_adjacent_atoms = true;
  }
  return saw_adjacent_atoms;
}


// Optimizes a common prefix when reading forwards, or suffix when reading

// backwards. E.g. turns ab|ac|ad into a(?:b|c|d).


// Rewrites this disjunction's alternatives in place. Every run of at least
// three consecutive atoms whose anchored code unit (the first one, or the last
// one when the compiler reads backwards) compares equal is replaced by a
// single RegExpAlternative consisting of the shared affix and a nested
// disjunction of the remainders. All other alternatives are kept in their
// original order; the list is trimmed to the new length at the end.
void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
  Zone* zone = compiler->zone();
  const bool backwards = compiler->read_backward();
  ZoneList<RegExpTree*>* alternatives = this->alternatives();
  int length = alternatives->length();
  const bool ignore_case = IsIgnoreCase(compiler->flags());

  // |i| is the read cursor, |write_posn| the (never larger) write cursor.
  int write_posn = 0;
  int i = 0;
  while (i < length) {
    RegExpTree* alternative = alternatives->at(i);
    if (!alternative->IsAtom()) {
      alternatives->at(write_posn++) = alternatives->at(i);
      i++;
      continue;
    }
    RegExpAtom* const atom = alternative->AsAtom();
    // The code unit every member of the run has to share: the first one when
    // reading forwards, the last one when reading backwards.
#ifdef V8_INTL_SUPPORT
    icu::UnicodeString common_affix(
        atom->data().at(backwards ? atom->length() - 1 : 0));
#else
    unibrow::Mapping<unibrow::Ecma262Canonicalize>* const canonicalize =
        compiler->isolate()->regexp_macro_assembler_canonicalize();
    unibrow::uchar common_affix =
        atom->data().at(backwards ? atom->length() - 1 : 0);
    if (ignore_case) {
      common_affix = Canonical(canonicalize, common_affix);
    }
#endif  // V8_INTL_SUPPORT
    int first_with_affix = i;
    // Upper bound for the shared affix: the shortest atom in the run.
    int affix_length = atom->length();
    i++;
    while (i < length) {
      alternative = alternatives->at(i);
      if (!alternative->IsAtom()) break;
      RegExpAtom* const alt_atom = alternative->AsAtom();
#ifdef V8_INTL_SUPPORT
      icu::UnicodeString new_affix(
          alt_atom->data().at(backwards ? alt_atom->length() - 1 : 0));
      if (!Equals(ignore_case, new_affix, common_affix)) break;
#else
      unibrow::uchar new_affix =
          alt_atom->data().at(backwards ? alt_atom->length() - 1 : 0);
      if (!Equals(ignore_case, canonicalize, new_affix, common_affix)) break;
#endif  // V8_INTL_SUPPORT
      affix_length = std::min(affix_length, alt_atom->length());
      i++;
    }
    if (i > first_with_affix + 2) {
      // Found worthwhile run of alternatives with common affix of at least one
      // character.  The sorting function above did not sort on more than one
      // character for reasons of correctness, but there may still be a longer
      // common affix if the terms were similar or presorted in the input.
      // Find out how long the common affix is.
      int run_length = i - first_with_affix;
      RegExpAtom* const alt_atom = alternatives->at(first_with_affix)->AsAtom();
      for (int j = 1; j < run_length && affix_length > 1; j++) {
        RegExpAtom* old_atom = alternatives->at(j + first_with_affix)->AsAtom();
        for (int k = 1; k < affix_length; k++) {
          // |k| is the distance from the anchored end. Distance 0 was already
          // matched when the run was collected, so when reading backwards the
          // code unit to compare is at length - 1 - k. (Using length - k would
          // re-check the last code unit and never look at the innermost one,
          // over-estimating the common suffix.)
          const int alt_atom_pos = backwards ? alt_atom->length() - 1 - k : k;
          const int old_atom_pos = backwards ? old_atom->length() - 1 - k : k;
#ifdef V8_INTL_SUPPORT
          if (!CharAtEquals(ignore_case, alt_atom, alt_atom_pos, old_atom,
                            old_atom_pos)) {
#else
          if (!CharAtEquals(ignore_case, canonicalize, alt_atom, alt_atom_pos,
                            old_atom, old_atom_pos)) {
#endif  // V8_INTL_SUPPORT
            affix_length = k;
            break;
          }
        }
      }
      // The shared affix is taken from the first atom of the run.
      const int common_start =
          backwards ? alt_atom->length() - affix_length : 0;
      RegExpAtom* common = zone->New<RegExpAtom>(alt_atom->data().SubVector(
          common_start, common_start + affix_length));
      // Collect what is left of each atom once the affix is stripped; an atom
      // that consisted only of the affix leaves an empty alternative.
      ZoneList<RegExpTree*>* distinct =
          zone->New<ZoneList<RegExpTree*>>(run_length, zone);
      for (int j = 0; j < run_length; j++) {
        RegExpAtom* old_atom = alternatives->at(j + first_with_affix)->AsAtom();
        int len = old_atom->length();
        if (len == affix_length) {
          distinct->Add(zone->New<RegExpEmpty>(), zone);
        } else {
          const int distinct_start = backwards ? 0 : affix_length;
          const int distinct_end = backwards ? old_atom->length() - affix_length
                                             : old_atom->length();
          RegExpTree* part = zone->New<RegExpAtom>(
              old_atom->data().SubVector(distinct_start, distinct_end));
          distinct->Add(part, zone);
        }
      }
      // Textual order is preserved: remainders precede a common suffix and
      // follow a common prefix.
      ZoneList<RegExpTree*>* pair = zone->New<ZoneList<RegExpTree*>>(2, zone);
      if (backwards) {
        pair->Add(zone->New<RegExpDisjunction>(distinct), zone);
        pair->Add(common, zone);
      } else {
        pair->Add(common, zone);
        pair->Add(zone->New<RegExpDisjunction>(distinct), zone);
      }
      alternatives->at(write_posn++) = zone->New<RegExpAlternative>(pair);
    } else {
      // Just copy any non-worthwhile alternatives.
      for (int j = first_with_affix; j < i; j++) {
        alternatives->at(write_posn++) = alternatives->at(j);
      }
    }
  }
  alternatives->Rewind(write_posn);  // Trim end of array.
}


// Optimizes b|c|z to [bcz].


void RegExpDisjunction::FixSingleCharacterDisjunctions(
    RegExpCompiler* compiler) {
  Zone* zone = compiler->zone();
  ZoneList<RegExpTree*>* alts = this->alternatives();
  const int alt_count = alts->length();

  // The list is compacted in place: |read| scans, |write| trails behind it.
  int write = 0;
  int read = 0;
  while (read < alt_count) {
    RegExpTree* const current = alts->at(read);
    // Anything that is not a one-code-unit atom is carried over unchanged.
    if (!current->IsAtom() || current->AsAtom()->length() != 1) {
      alts->at(write++) = current;
      read++;
      continue;
    }

    RegExpAtom* const head = current->AsAtom();
    const RegExpFlags flags = compiler->flags();
    DCHECK_IMPLIES(IsEitherUnicode(flags),
                   !unibrow::Utf16::IsLeadSurrogate(head->data().at(0)));
    bool saw_trail_surrogate =
        unibrow::Utf16::IsTrailSurrogate(head->data().at(0));

    // Extend the run over all directly following single-code-unit atoms.
    const int run_begin = read;
    for (read++; read < alt_count; read++) {
      RegExpTree* const candidate = alts->at(read);
      if (!candidate->IsAtom()) break;
      RegExpAtom* const candidate_atom = candidate->AsAtom();
      if (candidate_atom->length() != 1) break;
      DCHECK_IMPLIES(
          IsEitherUnicode(flags),
          !unibrow::Utf16::IsLeadSurrogate(candidate_atom->data().at(0)));
      saw_trail_surrogate |=
          unibrow::Utf16::IsTrailSurrogate(candidate_atom->data().at(0));
    }

    const int run_size = read - run_begin;
    if (run_size == 1) {
      // A lone single-character atom is not worth a character class.
      alts->at(write++) = alts->at(run_begin);
      continue;
    }

    // Merge the run into one character class of singleton ranges.
    ZoneList<CharacterRange>* singletons =
        zone->New<ZoneList<CharacterRange>>(2, zone);
    for (int offset = 0; offset < run_size; offset++) {
      RegExpAtom* member = alts->at(run_begin + offset)->AsAtom();
      DCHECK_EQ(member->length(), 1);
      singletons->Add(CharacterRange::Singleton(member->data().at(0)), zone);
    }
    RegExpClassRanges::ClassRangesFlags class_ranges_flags;
    if (IsEitherUnicode(flags) && saw_trail_surrogate) {
      class_ranges_flags = RegExpClassRanges::CONTAINS_SPLIT_SURROGATE;
    }
    alts->at(write++) =
        zone->New<RegExpClassRanges>(zone, singletons, class_ranges_flags);
  }
  alts->Rewind(write);  // Drop the now unused tail.
}


// Converts the disjunction into a ChoiceNode with one guarded alternative per
// remaining alternative, each continuing at |on_success|. Lists with more than
// two alternatives are first run through the atom-sorting, affix-factoring and
// single-character-merging passes; if that leaves exactly one alternative it
// is converted directly without a ChoiceNode.
RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
                                      RegExpNode* on_success) {
  compiler->ToNodeMaybeCheckForStackOverflow();

  ZoneList<RegExpTree*>* alts = this->alternatives();

  if (alts->length() > 2) {
    if (SortConsecutiveAtoms(compiler)) RationalizeConsecutiveAtoms(compiler);
    FixSingleCharacterDisjunctions(compiler);
    if (alts->length() == 1) {
      return alts->at(0)->ToNode(compiler, on_success);
    }
  }

  const int alt_count = alts->length();
  Zone* zone = compiler->zone();
  ChoiceNode* choice = zone->New<ChoiceNode>(alt_count, zone);
  for (int idx = 0; idx < alt_count; idx++) {
    RegExpNode* branch = alts->at(idx)->ToNode(compiler, on_success);
    choice->AddAlternative(GuardedAlternative(branch));
  }
  return choice;
}


// Converts this quantifier by delegating to the static overload with the
// quantifier's own bounds, greediness and body.
RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
                                     RegExpNode* on_success) {
  const auto lower_bound = min();
  const auto upper_bound = max();
  const auto greedy = is_greedy();
  const auto quantified = body();
  return ToNode(lower_bound, upper_bound, greedy, quantified, compiler,
                on_success);
}


namespace {

// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and

//         \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)

// Builds a two-way ChoiceNode that expresses a (non-)boundary assertion as a
// lookbehind followed by a lookahead over the word character class, with
// unicode case equivalents added to that class. Only valid when the compiler
// flags need unicode case equivalents (CHECKed).
RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
                                          RegExpNode* on_success,
                                          RegExpAssertion::Type type) {
  CHECK(NeedsUnicodeCaseEquivalents(compiler->flags()));
  Zone* zone = compiler->zone();
  ZoneList<CharacterRange>* word_chars =
      zone->New<ZoneList<CharacterRange>>(2, zone);
  CharacterRange::AddClassEscape(StandardCharacterSet::kWord, word_chars, true,
                                 zone);
  const int stack_register = compiler->UnicodeLookaroundStackRegister();
  const int position_register = compiler->UnicodeLookaroundPositionRegister();
  const bool is_boundary = type == RegExpAssertion::Type::BOUNDARY;

  ChoiceNode* choice = zone->New<ChoiceNode>(2, zone);
  // One alternative per possibility for the character on the left: first a
  // word character, then a non-word character.
  for (int pass = 0; pass < 2; pass++) {
    const bool word_on_left = pass == 0;
    // A boundary needs differing sides, a non-boundary needs matching sides.
    const bool word_on_right = is_boundary != word_on_left;

    // Left side: positive or negative lookbehind for a word character.
    RegExpLookaround::Builder behind(word_on_left, on_success, stack_register,
                                     position_register);
    RegExpNode* match_left = TextNode::CreateForCharacterRanges(
        zone, word_chars, true, behind.on_match_success());

    // Right side: lookahead that continues into the lookbehind built above.
    RegExpLookaround::Builder ahead(word_on_right, behind.ForMatch(match_left),
                                    stack_register, position_register);
    RegExpNode* match_right = TextNode::CreateForCharacterRanges(
        zone, word_chars, false, ahead.on_match_success());

    choice->AddAlternative(GuardedAlternative(ahead.ForMatch(match_right)));
  }
  return choice;
}

}  // anonymous namespace


// Converts this assertion into the node that checks it and then continues at
// |on_success|. Most assertion types map directly to an AssertionNode.
// (Non-)boundary assertions are lowered to lookarounds when the compiler flags
// need unicode case equivalents, and END_OF_LINE becomes a choice between a
// positive lookahead for a line terminator and an end-of-input check.
RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
                                    RegExpNode* on_success) {
  Zone* zone = compiler->zone();

  switch (assertion_type()) {
    case Type::START_OF_LINE:
      return AssertionNode::AfterNewline(on_success);
    case Type::START_OF_INPUT:
      return AssertionNode::AtStart(on_success);
    case Type::BOUNDARY:
      return NeedsUnicodeCaseEquivalents(compiler->flags())
                 ? BoundaryAssertionAsLookaround(compiler, on_success,
                                                 Type::BOUNDARY)
                 : AssertionNode::AtBoundary(on_success);
    case Type::NON_BOUNDARY:
      return NeedsUnicodeCaseEquivalents(compiler->flags())
                 ? BoundaryAssertionAsLookaround(compiler, on_success,
                                                 Type::NON_BOUNDARY)
                 : AssertionNode::AtNonBoundary(on_success);
    case Type::END_OF_INPUT:
      return AssertionNode::AtEnd(on_success);
    case Type::END_OF_LINE: {
      // Compile $ in multiline regexps as an alternation with a positive
      // lookahead in one side and an end-of-input on the other side.
      // We need two registers for the lookahead.
      int stack_pointer_register = compiler->AllocateRegister();
      int position_register = compiler->AllocateRegister();
      // The ChoiceNode to distinguish between a newline and end-of-input.
      ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
      // Create a newline atom. The class is constructed from the standard
      // character set directly, so no explicit range list is needed.
      RegExpClassRanges* newline_atom =
          zone->New<RegExpClassRanges>(StandardCharacterSet::kLineTerminator);
      ActionNode* submatch_success = ActionNode::PositiveSubmatchSuccess(
          stack_pointer_register, position_register,
          0,   // No captures inside.
          -1,  // Ignored if no captures.
          on_success);
      TextNode* newline_matcher =
          zone->New<TextNode>(newline_atom, false, submatch_success);
      // Wrap the newline matcher into a positive lookahead.
      RegExpNode* end_of_line = ActionNode::BeginPositiveSubmatch(
          stack_pointer_register, position_register, newline_matcher,
          submatch_success);
      // Add the two alternatives to the ChoiceNode: the lookahead first, the
      // end-of-input check second.
      GuardedAlternative eol_alternative(end_of_line);
      result->AddAlternative(eol_alternative);
      GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
      result->AddAlternative(end_alternative);
      return result;
    }
    default:
      UNREACHABLE();
  }
}


// Chains one BackReferenceNode per capture in captures() in front of
// |on_success| and returns the head of that chain.
RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
                                        RegExpNode* on_success) {
  Zone* zone = compiler->zone();
  const bool read_backward = compiler->read_backward();
  // At most one of the listed captures can have participated in the match,
  // and a back-reference to an unmatched capture is treated as empty, so it is
  // sufficient to emit a back-reference for every candidate.
  RegExpNode* chain = on_success;
  for (auto capture : *captures()) {
    const auto capture_index = capture->index();
    chain = zone->New<BackReferenceNode>(
        RegExpCapture::StartRegister(capture_index),
        RegExpCapture::EndRegister(capture_index), read_backward, chain);
  }
  return chain;
}


// An empty term adds no node of its own: conversion returns |on_success|
// unchanged and |compiler| is not used.
RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
                                RegExpNode* on_success) {
  return on_success;
}


namespace {


// Scope guard for the compiler's flags: installs |flags| on construction and
// puts the flags that were active before back on destruction.
class V8_NODISCARD ModifiersScope {
 public:
  ModifiersScope(RegExpCompiler* compiler, RegExpFlags flags)
      : compiler_(compiler), saved_flags_(compiler->flags()) {
    compiler_->set_flags(flags);
  }

  ~ModifiersScope() { compiler_->set_flags(saved_flags_); }

 private:
  RegExpCompiler* compiler_;
  // Flags in effect when the scope was entered.
  const RegExpFlags saved_flags_;
};


}  // namespace


// Converts the group's body. When the group's flags differ from the
// compiler's current flags, the body is compiled under the group's flags and
// bracketed by two ModifyFlags action nodes: one that switches to the group's
// flags before the body and one that switches back before |on_success|.
RegExpNode* RegExpGroup::ToNode(RegExpCompiler* compiler,
                                RegExpNode* on_success) {
  const RegExpFlags outer_flags = compiler->flags();
  // Fast path: the group does not change any flags.
  if (flags() == outer_flags) {
    return body_->ToNode(compiler, on_success);
  }

  // The successor has to run with the enclosing flags again.
  RegExpNode* restore_then_continue =
      ActionNode::ModifyFlags(outer_flags, on_success);

  // Compile the body with the group's flags installed on the compiler; the
  // scope reinstalls the enclosing flags when this function returns.
  ModifiersScope modifiers_scope(compiler, flags());
  RegExpNode* body_node = body_->ToNode(compiler, restore_then_continue);

  // Entering the group switches to the group's flags.
  return ActionNode::ModifyFlags(flags(), body_node);
}


// Stores the lookaround parameters and creates the node the lookaround body
// has to continue to when it matches: a PositiveSubmatchSuccess action that
// leads on to |on_success| for positive lookarounds, or a
// NegativeSubmatchSuccess node for negative ones.
RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
                                   int stack_pointer_register,
                                   int position_register,
                                   int capture_register_count,
                                   int capture_register_start)
    : is_positive_(is_positive),
      on_success_(on_success),
      stack_pointer_register_(stack_pointer_register),
      position_register_(position_register) {
  if (!is_positive_) {
    Zone* zone = on_success_->zone();
    on_match_success_ = zone->New<NegativeSubmatchSuccess>(
        stack_pointer_register, position_register, capture_register_count,
        capture_register_start, zone);
    return;
  }
  on_match_success_ = ActionNode::PositiveSubmatchSuccess(
      stack_pointer_register, position_register, capture_register_count,
      capture_register_start, on_success_);
}


// Wraps |match| (the compiled lookaround body) into the node that starts the
// submatch and returns that entry node.
RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
  if (!is_positive_) {
    Zone* zone = on_success_->zone();
    // A negative lookaround is modelled as a two-way choice. The first
    // alternative runs the body; if it matches, the end node backtracks. If
    // it fails, the second alternative is taken and leads to success.
    // NegativeLookaroundChoiceNode is a ChoiceNode variant that ignores the
    // first exit when calculating quick checks.
    ChoiceNode* negative_choice = zone->New<NegativeLookaroundChoiceNode>(
        GuardedAlternative(match), GuardedAlternative(on_success_), zone);
    return ActionNode::BeginNegativeSubmatch(
        stack_pointer_register_, position_register_, negative_choice);
  }

  ActionNode* success_action = on_match_success_->AsActionNode();
  return ActionNode::BeginPositiveSubmatch(
      stack_pointer_register_, position_register_, match, success_action);
}


// Converts the lookaround: allocates the two registers it needs, compiles the
// body with the compiler's read direction set according to the lookaround
// type, and restores the previous read direction before returning.
RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
                                     RegExpNode* on_success) {
  compiler->ToNodeMaybeCheckForStackOverflow();

  const int stack_pointer_register = compiler->AllocateRegister();
  const int position_register = compiler->AllocateRegister();

  // Register span of the captures inside the lookaround: each capture takes
  // two registers and the first capture register is register 2.
  constexpr int kRegistersPerCapture = 2;
  constexpr int kRegisterOfFirstCapture = 2;
  const int capture_register_count = capture_count_ * kRegistersPerCapture;
  const int capture_register_start =
      kRegisterOfFirstCapture + capture_from_ * kRegistersPerCapture;

  const bool outer_read_backward = compiler->read_backward();
  compiler->set_read_backward(type() == LOOKBEHIND);

  Builder builder(is_positive(), on_success, stack_pointer_register,
                  position_register, capture_register_count,
                  capture_register_start);
  RegExpNode* body_node = body_->ToNode(compiler, builder.on_match_success());
  RegExpNode* const lookaround = builder.ForMatch(body_node);

  compiler->set_read_backward(outer_read_backward);
  return lookaround;
}


// Converts this capture by delegating to the static overload with the
// capture's own body and index.
RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
                                  RegExpNode* on_success) {
  const auto captured_body = body();
  const auto capture_index = index();
  return ToNode(captured_body, capture_index, compiler, on_success);
}


// Surrounds the compiled |body| with two StorePosition actions for capture
// |index|: one executed before the body and one executed after it. When the
// compiler reads backwards the roles of the start and end registers are
// exchanged.
RegExpNode* RegExpCapture::ToNode(RegExpTree* body, int index,
                                  RegExpCompiler* compiler,
                                  RegExpNode* on_success) {
  DCHECK_NOT_NULL(body);
  const int start_register = RegExpCapture::StartRegister(index);
  const int end_register = RegExpCapture::EndRegister(index);
  const bool backward = compiler->read_backward();
  // Register written when the body is entered / left in match order.
  const int entry_register = backward ? end_register : start_register;
  const int exit_register = backward ? start_register : end_register;

  RegExpNode* store_on_exit =
      ActionNode::StorePosition(exit_register, true, on_success);
  RegExpNode* body_node = body->ToNode(compiler, store_on_exit);
  return ActionNode::StorePosition(entry_register, true, body_node);
}


namespace {


class AssertionSequenceRewriter final {

 public:

  // TODO(jgruber): Consider moving this to a separate AST tree rewriter pass

  // instead of sprinkling rewrites into the AST->Node conversion process.

  static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) {

    AssertionSequenceRewriter rewriter(terms, zone);


    static constexpr int kNoIndex = -1;

    int from = kNoIndex;


    for (int i = 0; i < terms->length(); i++) {

      RegExpTree* t = terms->at(i);

      if (from == kNoIndex && t->IsAssertion()) {

        from = i;  // Start a sequence.

      } else if (from != kNoIndex && !t->IsAssertion()) {

        // Terminate and process the sequence.

        if (i - from > 1) rewriter.Rewrite(from, i);

        from = kNoIndex;

      }

    }


    if (from != kNoIndex && terms->length() - from > 1) {

      rewriter.Rewrite(from, terms->length());

    }

  }


  // All assertions are zero width. A consecutive sequence of assertions is

  // order-independent. There's two ways we can optimize here:

  // 1. fold all identical assertions.

  // 2. if any assertion combinations are known to fail (e.g. \b\B), the entire

  //    sequence fails.

  void Rewrite(int from, int to) {

    DCHECK_GT(to, from + 1);


    // Bitfield of all seen assertions.

    uint32_t seen_assertions = 0;

    static_assert(static_cast<int>(RegExpAssertion::Type::LAST_ASSERTION_TYPE) <

                  kUInt32Size * kBitsPerByte);


    for (int i = from; i < to; i++) {

      RegExpAssertion* t = terms_->at(i)->AsAssertion();

      const uint32_t bit = 1 << static_cast<int>(t->assertion_type());


      if (seen_assertions & bit) {

        // Fold duplicates.

        terms_->Set(i, zone_->New<RegExpEmpty>());

      }


      seen_assertions |= bit;

    }


    // Collapse failures.

    const uint32_t always_fails_mask =

        1 << static_cast<int>(RegExpAssertion::Type::BOUNDARY) |

        1 << static_cast<int>(RegExpAssertion::Type::NON_BOUNDARY);

    if ((seen_assertions & always_fails_mask) == always_fails_mask) {

      ReplaceSequenceWithFailure(from, to);

    }

  }


  void ReplaceSequenceWithFailure(int from, int to) {

    // Replace the entire sequence with a single node that always fails.

    // TODO(jgruber): Consider adding an explicit Fail kind. Until then, the

    // negated '*' (everything) range serves the purpose.

    ZoneList<CharacterRange>* ranges =

        zone_->New<ZoneList<CharacterRange>>(0, zone_);

    RegExpClassRanges* cc = zone_->New<RegExpClassRanges>(zone_, ranges);

    terms_->Set(from, cc);


    // Zero out the rest.

    RegExpEmpty* empty = zone_->New<RegExpEmpty>();

    for (int i = from + 1; i < to; i++) terms_->Set(i, empty);

  }


 private:

  AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone)

      : zone_(zone), terms_(terms) {}


  Zone* zone_;

  ZoneList<RegExpTree*>* terms_;

};


}  // namespace


// Converts a sequence of terms into a chain of nodes ending in |on_success|.
// Each term is compiled with the previously built node as its continuation,
// so terms are visited from the one matched last to the one matched first:
// back to front when reading forwards, front to back when reading backwards.
RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
                                      RegExpNode* on_success) {
  compiler->ToNodeMaybeCheckForStackOverflow();

  ZoneList<RegExpTree*>* children = nodes();

  AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone());

  const bool backward = compiler->read_backward();
  const int child_count = children->length();
  RegExpNode* continuation = on_success;
  for (int step = 0; step < child_count; step++) {
    const int idx = backward ? step : child_count - 1 - step;
    continuation = children->at(idx)->ToNode(compiler, continuation);
  }
  return continuation;
}


namespace {


// Appends the ranges described by the table |elmv| to |ranges|. The table
// holds |elmc| entries: pairs of (inclusive start, exclusive end) followed by
// a final kRangeEndMarker entry.
void AddClass(const int* elmv, int elmc, ZoneList<CharacterRange>* ranges,
              Zone* zone) {
  const int marker_index = elmc - 1;
  DCHECK_EQ(kRangeEndMarker, elmv[marker_index]);
  for (int i = 0; i < marker_index; i += 2) {
    const int from = elmv[i];
    const int end = elmv[i + 1];
    DCHECK(from < end);
    // CharacterRange bounds are inclusive, the table's end is exclusive.
    ranges->Add(CharacterRange::Range(from, end - 1), zone);
  }
}


void AddClassNegated(const int* elmv, int elmc,

                     ZoneList<CharacterRange>* ranges, Zone* zone) {

  elmc--;

  DCHECK_EQ(kRangeEndMarker, elmv[elmc]);

  DCHECK_NE(0x0000, elmv[0]);

  DCHECK_NE(kMaxCodePoint, elmv[elmc - 1]);

  base::uc16 last = 0x0000;

  for (int i = 0; i < elmc; i += 2) {

    DCHECK(last <= elmv[i] - 1);

    DCHECK(elmv[i] < elmv[i + 1]);

    ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);

    last = elmv[i + 1];

  }

  ranges->Add(CharacterRange::Range(last, kMaxCodePoint), zone);

}


}  // namespace


// Appends the ranges of |standard_character_set| to |ranges|. When
// |add_unicode_case_equivalents| is set, the word and not-word sets are
// built from the word ranges closed over unicode case equivalents; all other
// sets are unaffected by that flag.
void CharacterRange::AddClassEscape(StandardCharacterSet standard_character_set,
                                    ZoneList<CharacterRange>* ranges,
                                    bool add_unicode_case_equivalents,
                                    Zone* zone) {
  const bool is_word = standard_character_set == StandardCharacterSet::kWord;
  const bool is_not_word =
      standard_character_set == StandardCharacterSet::kNotWord;
  if (add_unicode_case_equivalents && (is_word || is_not_word)) {
    // See #sec-runtime-semantics-wordcharacters-abstract-operation
    // With unicode and ignore_case, the closure over case equivalent
    // characters has to be computed before negating.
    ZoneList<CharacterRange>* word_closure =
        zone->New<ZoneList<CharacterRange>>(2, zone);
    AddClass(kWordRanges, kWordRangeCount, word_closure, zone);
    AddUnicodeCaseEquivalents(word_closure, zone);
    if (is_not_word) {
      ZoneList<CharacterRange>* complement =
          zone->New<ZoneList<CharacterRange>>(2, zone);
      CharacterRange::Negate(word_closure, complement, zone);
      word_closure = complement;
    }
    ranges->AddAll(*word_closure, zone);
    return;
  }

  switch (standard_character_set) {
    case StandardCharacterSet::kWhitespace:
      AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
      return;
    case StandardCharacterSet::kNotWhitespace:
      AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
      return;
    case StandardCharacterSet::kWord:
      AddClass(kWordRanges, kWordRangeCount, ranges, zone);
      return;
    case StandardCharacterSet::kNotWord:
      AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
      return;
    case StandardCharacterSet::kDigit:
      AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
      return;
    case StandardCharacterSet::kNotDigit:
      AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
      return;
    // The characters matched by the $ and ^ symbols in multiline mode.
    case StandardCharacterSet::kLineTerminator:
      AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone);
      return;
    case StandardCharacterSet::kNotLineTerminator:
      AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges,
                      zone);
      return;
    // Not a character range as defined by the spec, but a convenient
    // shorthand for a character class that matches any character.
    case StandardCharacterSet::kEverything:
      ranges->Add(CharacterRange::Everything(), zone);
      return;
  }
}


// static

// Only for /i, not for /ui or /vi.


void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,

                                        ZoneList<CharacterRange>* ranges,

                                        bool is_one_byte) {

  CharacterRange::Canonicalize(ranges);

  int range_count = ranges->length();

#ifdef V8_INTL_SUPPORT

  icu::UnicodeSet others;

  for (int i = 0; i < range_count; i++) {

    CharacterRange range = ranges->at(i);

    base::uc32 from = range.from();

    if (from > kMaxUtf16CodeUnit) continue;

    base::uc32 to = std::min({range.to(), kMaxUtf16CodeUnitU});

    // Nothing to be done for surrogates.

    if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;

    if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {

      if (from > String::kMaxOneByteCharCode) continue;

      if (to > String::kMaxOneByteCharCode) to = String::kMaxOneByteCharCode;

    }

    others.add(from, to);

  }


  // Compute the set of additional characters that should be added,

  // using UnicodeSet::closeOver. ECMA 262 defines slightly different

  // case-folding rules than Unicode, so some characters that are

  // added by closeOver do not match anything other than themselves in

  // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the

  // same case-insensitive character as 's' or 'S' according to

  // Unicode, but does not match any other character in JS. To handle

  // this case, we add such characters to the IgnoreSet and filter

  // them out. We filter twice: once before calling closeOver (to

  // prevent 'ſ' from adding 's'), and once after calling closeOver

  // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for

  // more information.

  icu::UnicodeSet already_added(others);

  others.removeAll(RegExpCaseFolding::IgnoreSet());

  others.closeOver(USET_CASE_INSENSITIVE);

  others.removeAll(RegExpCaseFolding::IgnoreSet());

  others.removeAll(already_added);


  // Add others to the ranges

  for (int32_t i = 0; i < others.getRangeCount(); i++) {

    UChar32 from = others.getRangeStart(i);

    UChar32 to = others.getRangeEnd(i);

    if (from == to) {

      ranges->Add(CharacterRange::Singleton(from), zone);

    } else {

      ranges->Add(CharacterRange::Range(from, to), zone);

    }

  }

#else

  for (int i = 0; i < range_count; i++) {

    CharacterRange range = ranges->at(i);

    base::uc32 bottom = range.from();

    if (bottom > kMaxUtf16CodeUnit) continue;

    base::uc32 top = std::min({range.to(), kMaxUtf16CodeUnitU});

    // Nothing to be done for surrogates.

    if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;

    if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {

      if (bottom > String::kMaxOneByteCharCode) continue;

      if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;

    }

    unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

    if (top == bottom) {

      // If this is a singleton we just expand the one character.

      int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);

      for (int j = 0; j < length; j++) {

        base::uc32 chr = chars[j];

        if (chr != bottom) {

          ranges->Add(CharacterRange::Singleton(chars[j]), zone);

        }

      }

    } else {

      // If this is a range we expand the characters block by block, expanding

      // contiguous subranges (blocks) one at a time.  The approach is as

      // follows.  For a given start character we look up the remainder of the

      // block that contains it (represented by the end point), for instance we

      // find 'z' if the character is 'c'.  A block is characterized by the

      // property that all characters uncanonicalize in the same way, except

      // that each entry in the result is incremented by the distance from the

      // first element.  So a-z is a block because 'a' uncanonicalizes to ['a',

      // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k].  Once

      // we've found the end point we look up its uncanonicalization and

      // produce a range for each element.  For instance for [c-f] we look up

      // ['z', 'Z'] and produce [c-f] and [C-F].  We then only add a range if

      // it is not already contained in the input, so [c-f] will be skipped but

      // [C-F] will be added.  If this range is not completely contained in a

      // block we do this for all the blocks covered by the range (handling

      // characters that is not in a block as a "singleton block").

      unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];

      base::uc32 pos = bottom;

      while (pos <= top) {

        int length =

            isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);

        base::uc32 block_end;

        if (length == 0) {

          block_end = pos;

        } else {

          DCHECK_EQ(1, length);

          block_end = equivalents[0];

        }

        int end = (block_end > top) ? top : block_end;

        length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',

                                                         equivalents);

        for (int j = 0; j < length; j++) {

          base::uc32 c = equivalents[j];

          base::uc32 range_from = c - (block_end - pos);

          base::uc32 range_to = c - (block_end - end);

          if (!(bottom <= range_from && range_to <= top)) {

            ranges->Add(CharacterRange::Range(range_from, range_to), zone);

          }

        }

        pos = end + 1;

      }

    }

  }

#endif  // V8_INTL_SUPPORT

}


bool CharacterRange::IsCanonical(const ZoneList<CharacterRange>* ranges) {

  DCHECK_NOT_NULL(ranges);

  int n = ranges->length();

  if (n <= 1) return true;

  base::uc32 max = ranges->at(0).to();

  for (int i = 1; i < n; i++) {

    CharacterRange next_range = ranges->at(i);

    if (next_range.from() <= max + 1) return false;

    max = next_range.to();

  }

  return true;

}


// Returns the range list backing this set. If no list exists yet, one is
// allocated in |zone|, filled from the standard set type through
// AddClassEscape (without unicode case equivalents), stored in ranges_ and
// returned; later calls return the stored list.
ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
  if (ranges_ != nullptr) return ranges_;

  constexpr bool kAddUnicodeCaseEquivalents = false;
  ranges_ = zone->New<ZoneList<CharacterRange>>(2, zone);
  CharacterRange::AddClassEscape(standard_set_type_.value(), ranges_,
                                 kAddUnicodeCaseEquivalents, zone);
  return ranges_;
}


namespace {


// Copies |count| consecutive elements of |list| from index |from| to index
// |to|. The source and destination windows may overlap, so the copy direction
// is chosen such that no element is overwritten before it has been read.
void MoveRanges(ZoneList<CharacterRange>* list, int from, int to, int count) {
  if (from < to) {
    // Shifting towards higher indices: copy back to front.
    for (int offset = count - 1; offset >= 0; --offset) {
      list->at(to + offset) = list->at(from + offset);
    }
    return;
  }
  // Shifting towards lower indices (or in place): copy front to back.
  for (int offset = 0; offset < count; ++offset) {
    list->at(to + offset) = list->at(from + offset);
  }
}


// Inserts |insert| into the canonical prefix list[0..count[ (sorted by start
// point, non-overlapping, non-adjacent), merging it with every existing range
// it overlaps or touches. The result uses at most list[0..count], so the slot
// at index |count| must be writable. Returns the number of canonical ranges
// afterwards, which lies in 1..count+1 because an insertion can collapse
// several existing ranges into one.
int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count,
                               CharacterRange insert) {
  const base::uc32 insert_from = insert.from();
  const base::uc32 insert_to = insert.to();

  // Scan from the back to determine the window [first_hit, past_hit[ of
  // existing ranges that overlap, or are adjacent to, the inserted range.
  // Ranges before first_hit and from past_hit onwards are unaffected.
  int first_hit = 0;
  int past_hit = count;
  for (int index = count - 1; index >= 0; --index) {
    const CharacterRange existing = list->at(index);
    if (existing.from() > insert_to + 1) {
      // Lies above the inserted range with a gap in between.
      past_hit = index;
      continue;
    }
    if (existing.to() + 1 < insert_from) {
      // Lies below the inserted range with a gap; so does everything before.
      first_hit = index + 1;
      break;
    }
  }

  const int hit_count = past_hit - first_hit;

  if (hit_count == 0) {
    // Nothing to merge with: open a slot at first_hit and store the range.
    if (first_hit < count) {
      MoveRanges(list, first_hit, first_hit + 1, count - first_hit);
    }
    list->at(first_hit) = insert;
    return count + 1;
  }

  if (hit_count == 1) {
    // Exactly one existing range is affected: widen it in place.
    const CharacterRange absorbed = list->at(first_hit);
    const base::uc32 merged_from = std::min(absorbed.from(), insert_from);
    const base::uc32 merged_to = std::max(absorbed.to(), insert_to);
    list->at(first_hit) = CharacterRange::Range(merged_from, merged_to);
    return count;
  }

  // Several existing ranges are affected: replace them by a single merged
  // range and move the untouched tail down right behind it.
  const base::uc32 merged_from =
      std::min(list->at(first_hit).from(), insert_from);
  const base::uc32 merged_to =
      std::max(list->at(past_hit - 1).to(), insert_to);
  if (past_hit < count) {
    MoveRanges(list, past_hit, first_hit + 1, count - past_hit);
  }
  list->at(first_hit) = CharacterRange::Range(merged_from, merged_to);
  return count - hit_count + 1;
}


}  // namespace


// Canonicalizes the explicit range list, if there is one. A set without an
// explicit list (a special/default class) is considered canonical already;
// the list ranges() produces for it will be sorted.
void CharacterSet::Canonicalize() {
  if (ranges_ != nullptr) CharacterRange::Canonicalize(ranges_);
}


// static
// Rewrites |character_ranges| in place into canonical form (increasing,
// non-overlapping, non-adjacent) and shortens the list to the number of
// resulting ranges.
void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
  const int total = character_ranges->length();
  if (total <= 1) return;

  // Measure the leading run of ranges that is already canonical.
  base::uc32 run_end = character_ranges->at(0).to();
  int canonical_count = 1;
  for (; canonical_count < total; canonical_count++) {
    const CharacterRange candidate = character_ranges->at(canonical_count);
    if (candidate.from() <= run_end + 1) break;
    run_end = candidate.to();
  }
  // If the run covers the whole list there is nothing to do.
  if (canonical_count == total) return;

  // Insertion sort for the rest: each remaining range is inserted into the
  // canonical prefix in order. An insertion may merge ranges, so the prefix
  // can shrink as well as grow.
  for (int next = canonical_count; next < total; next++) {
    canonical_count = InsertRangeInCanonicalList(
        character_ranges, canonical_count, character_ranges->at(next));
  }
  character_ranges->Rewind(canonical_count);

  DCHECK(CharacterRange::IsCanonical(character_ranges));
}


// static


void CharacterRange::Negate(const ZoneList<CharacterRange>* ranges,

                            ZoneList<CharacterRange>* negated_ranges,

                            Zone* zone) {

  DCHECK(CharacterRange::IsCanonical(ranges));

  DCHECK_EQ(0, negated_ranges->length());

  int range_count = ranges->length();

  base::uc32 from = 0;

  int i = 0;

  if (range_count > 0 && ranges->at(0).from() == 0) {

    from = ranges->at(0).to() + 1;

    i = 1;

  }

  while (i < range_count) {

    CharacterRange range = ranges->at(i);

    negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);

    from = range.to() + 1;

    i++;

  }

  if (from < kMaxCodePoint) {

    negated_ranges->Add(CharacterRange::Range(from, kMaxCodePoint), zone);

  }

}


// static


void CharacterRange::Intersect(const ZoneList<CharacterRange>* lhs,

                               const ZoneList<CharacterRange>* rhs,

                               ZoneList<CharacterRange>* intersection,

                               Zone* zone) {

  DCHECK(CharacterRange::IsCanonical(lhs));

  DCHECK(CharacterRange::IsCanonical(rhs));

  DCHECK_EQ(0, intersection->length());

  int lhs_index = 0;

  int rhs_index = 0;

  while (lhs_index < lhs->length() && rhs_index < rhs->length()) {

    // Skip non-overlapping ranges.

    if (lhs->at(lhs_index).to() < rhs->at(rhs_index).from()) {

      lhs_index++;

      continue;

    }

    if (rhs->at(rhs_index).to() < lhs->at(lhs_index).from()) {

      rhs_index++;

      continue;

    }


    base::uc32 from =

        std::max(lhs->at(lhs_index).from(), rhs->at(rhs_index).from());

    base::uc32 to = std::min(lhs->at(lhs_index).to(), rhs->at(rhs_index).to());

    intersection->Add(CharacterRange::Range(from, to), zone);

    if (to == lhs->at(lhs_index).to()) {

      lhs_index++;

    } else {

      rhs_index++;

    }

  }


  DCHECK(IsCanonical(intersection));

}


namespace {


// Advance |index| and set |from| and |to| to the new range, if not out of

// bounds of |range|, otherwise |from| is set to a code point beyond the legal

// unicode character range.

void SafeAdvanceRange(const ZoneList<CharacterRange>* range, int* index,

                      base::uc32* from, base::uc32* to) {

  ++(*index);

  if (*index < range->length()) {

    *from = range->at(*index).from();

    *to = range->at(*index).to();

  } else {

    *from = kMaxCodePoint + 1;

  }

}


}  // namespace


// static


// Appends to |result| the set difference |src| minus |to_remove|. Both input
// lists must be canonical and |result| must be empty on entry (DCHECKed);
// new ranges are allocated via |zone|. Returns without adding anything when
// |src| is empty.
//
// The loop walks both lists in parallel. [from, to] is the not yet emitted
// remainder of the current |src| range; it is trimmed, split or dropped
// depending on how it lies relative to the current |to_remove| range.
void CharacterRange::Subtract(const ZoneList<CharacterRange>* src,

                              const ZoneList<CharacterRange>* to_remove,

                              ZoneList<CharacterRange>* result, Zone* zone) {

  DCHECK(CharacterRange::IsCanonical(src));

  DCHECK(CharacterRange::IsCanonical(to_remove));

  DCHECK_EQ(0, result->length());


  if (src->is_empty()) return;


  int src_index = 0;

  int to_remove_index = 0;

  base::uc32 from = src->at(src_index).from();

  base::uc32 to = src->at(src_index).to();

  while (src_index < src->length() && to_remove_index < to_remove->length()) {

    CharacterRange remove_range = to_remove->at(to_remove_index);

    if (remove_range.to() < from) {

      // (a) Non-overlapping case, ignore current to_remove range.

      //            |-------|

      // |-------|

      to_remove_index++;

    } else if (to < remove_range.from()) {

      // (b) Non-overlapping case, add full current range to result.

      // |-------|

      //            |-------|

      result->Add(CharacterRange::Range(from, to), zone);

      SafeAdvanceRange(src, &src_index, &from, &to);

    } else if (from >= remove_range.from() && to <= remove_range.to()) {

      // (c) Current to_remove range fully covers current range.

      //   |---|

      // |-------|

      SafeAdvanceRange(src, &src_index, &from, &to);

    } else if (from < remove_range.from() && to > remove_range.to()) {

      // (d) Split current range.

      // |-------|

      //   |---|

      // The part after the removed range stays pending in [from, to]; later
      // to_remove ranges may still cut into it.
      result->Add(CharacterRange::Range(from, remove_range.from() - 1), zone);

      from = remove_range.to() + 1;

      to_remove_index++;

    } else if (from < remove_range.from()) {

      // (e) End current range.

      // |-------|

      //    |-------|

      to = remove_range.from() - 1;

      result->Add(CharacterRange::Range(from, to), zone);

      SafeAdvanceRange(src, &src_index, &from, &to);

    } else if (to > remove_range.to()) {

      // (f) Modify start of current range.

      //    |-------|

      // |-------|

      from = remove_range.to() + 1;

      to_remove_index++;

    } else {

      UNREACHABLE();

    }

  }

  // The last range needs special treatment after |to_remove| is exhausted, as

  // |from| might have been modified by the last |to_remove| range and |to| was

  // not yet known (i.e. cases d and f).

  // If the loop ended because |src| ran out instead, SafeAdvanceRange has set
  // |from| above kMaxCodePoint, so (with |to| still a valid code point) this
  // check fails and nothing is added.
  if (from <= to) {

    result->Add(CharacterRange::Range(from, to), zone);

  }

  src_index++;


  // Add remaining ranges after |to_remove| is exhausted.

  for (; src_index < src->length(); src_index++) {

    result->Add(src->at(src_index), zone);

  }


  DCHECK(IsCanonical(result));

}


// static
// Restricts the canonical list |ranges| to one-byte code units: ranges that
// start above String::kMaxOneByteCharCodeU are dropped, and the last
// remaining range is cut off at that limit. This relies on |ranges| being
// canonical, i.e. sorted and non-overlapping.
void CharacterRange::ClampToOneByte(ZoneList<CharacterRange>* ranges) {
  DCHECK(IsCanonical(ranges));

  static constexpr base::uc32 kLimit = String::kMaxOneByteCharCodeU;

  // Ranges lying entirely above the limit sit at the end of the sorted list;
  // count how many leading ranges survive.
  int keep = ranges->length();
  while (keep > 0 && ranges->at(keep - 1).from() > kLimit) keep--;

  // Only the last surviving range can reach beyond the limit.
  if (keep > 0) {
    CharacterRange& last = ranges->at(keep - 1);
    last.to_ = std::min(last.to_, kLimit);
  }

  ranges->Rewind(keep);
}


// static
// Returns true iff |lhs| and |rhs| hold the same ranges in the same order.
// Both lists must be canonical (DCHECKed).
bool CharacterRange::Equals(const ZoneList<CharacterRange>* lhs,
                            const ZoneList<CharacterRange>* rhs) {
  DCHECK(IsCanonical(lhs));
  DCHECK(IsCanonical(rhs));

  const int count = lhs->length();
  if (count != rhs->length()) return false;

  for (int index = 0; index < count; ++index) {
    const bool differs = lhs->at(index) != rhs->at(index);
    if (differs) return false;
  }
  return true;
}


namespace {


// Scoped object that tracks how much quantifier loops are unrolled by the
// regexp graph generator. On construction it multiplies the compiler's
// current expansion factor by |factor| (unless the budget is already
// exceeded) and records whether the result still fits within
// kMaxExpansionFactor; on destruction it restores the factor that was in
// effect before.
class RegExpExpansionLimiter {
 public:
  static const int kMaxExpansionFactor = 6;

  RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
      : compiler_(compiler),
        saved_expansion_factor_(compiler->current_expansion_factor()),
        ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
    DCHECK_LT(0, factor);
    // Already over budget: leave the compiler's factor untouched.
    if (!ok_to_expand_) return;

    // A factor above the cap is not multiplied in; the new factor is pinned
    // just above the cap instead, which avoids integer overflow of the
    // current expansion factor.
    const bool factor_too_large = factor > kMaxExpansionFactor;
    const int new_factor = factor_too_large
                               ? kMaxExpansionFactor + 1
                               : saved_expansion_factor_ * factor;
    ok_to_expand_ = !factor_too_large && new_factor <= kMaxExpansionFactor;
    compiler->set_current_expansion_factor(new_factor);
  }

  ~RegExpExpansionLimiter() {
    // Undo whatever the constructor did to the compiler's factor.
    compiler_->set_current_expansion_factor(saved_expansion_factor_);
  }

  // Whether unrolling by the requested factor stays within the budget.
  bool ok_to_expand() { return ok_to_expand_; }

 private:
  RegExpCompiler* compiler_;
  int saved_expansion_factor_;  // Factor to restore on destruction.
  bool ok_to_expand_;

  DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
};


}  // namespace


// Builds the node graph for |body| repeated at least |min| and at most |max|
// times, continuing with |on_success| afterwards, and returns the entry node.
//   min, max     - repetition bounds; a max equal to RegExpTree::kInfinity is
//                  treated as "no upper bound" (no LT guard is emitted).
//   is_greedy    - selects the order in which the loop and continue
//                  alternatives are added.
//   body         - the quantified sub-expression.
//   compiler     - supplies the zone, registers, the read direction and the
//                  optimize flag.
//   on_success   - node to continue with once the quantifier has matched.
//   not_at_start - forwarded to the created choice nodes via
//                  set_not_at_start() when not reading backward.
// Small quantifiers whose body cannot match empty and has no captures may be
// unrolled instead of being compiled to a counted loop.
RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,

                                     RegExpTree* body, RegExpCompiler* compiler,

                                     RegExpNode* on_success,

                                     bool not_at_start) {

  // x{f, t} becomes this:

  //

  //             (r++)<-.

  //               |     `

  //               |     (x)

  //               v     ^

  //      (r=0)-->(?)---/ [if r < t]

  //               |

  //   [if r >= f] \----> ...

  //


  // 15.10.2.5 RepeatMatcher algorithm.

  // The parser has already eliminated the case where max is 0.  In the case

  // where max_match is zero the parser has removed the quantifier if min was

  // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.


  // If we know that we cannot match zero length then things are a little

  // simpler since we don't need to make the special zero length match check

  // from step 2.1.  If the min and max are small we can unroll a little in

  // this case.

  static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}

  static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}

  if (max == 0) return on_success;  // This can happen due to recursion.

  bool body_can_be_empty = (body->min_match() == 0);

  int body_start_reg = RegExpCompiler::kNoRegister;

  Interval capture_registers = body->CaptureRegisters();

  bool needs_capture_clearing = !capture_registers.is_empty();

  Zone* zone = compiler->zone();


  if (body_can_be_empty) {

    // Register used below to store the position at which the body started.
    body_start_reg = compiler->AllocateRegister();

  } else if (compiler->optimize() && !needs_capture_clearing) {

    // Only unroll if there are no captures and the body can't be

    // empty.

    // The braces scope the limiter: its destructor restores the compiler's
    // expansion factor before the optional-match unrolling below is tried.
    {

      RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));

      if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {

        int new_max = (max == kInfinity) ? max : max - min;

        // Recurse once to get the loop or optional matches after the fixed

        // ones.

        RegExpNode* answer =

            ToNode(0, new_max, is_greedy, body, compiler, on_success, true);

        // Unroll the forced matches from 0 to min.  This can cause chains of

        // TextNodes (which the parser does not generate).  These should be

        // combined if it turns out they hinder good code generation.

        for (int i = 0; i < min; i++) {

          answer = body->ToNode(compiler, answer);

        }

        return answer;

      }

    }

    if (max <= kMaxUnrolledMaxMatches && min == 0) {

      DCHECK_LT(0, max);  // Due to the 'if' above.

      RegExpExpansionLimiter limiter(compiler, max);

      if (limiter.ok_to_expand()) {

        // Unroll the optional matches up to max.

        RegExpNode* answer = on_success;

        for (int i = 0; i < max; i++) {

          ChoiceNode* alternation = zone->New<ChoiceNode>(2, zone);

          // Greedy: the body alternative is added first; otherwise the
          // alternative that skips straight to on_success is added first.
          if (is_greedy) {

            alternation->AddAlternative(

                GuardedAlternative(body->ToNode(compiler, answer)));

            alternation->AddAlternative(GuardedAlternative(on_success));

          } else {

            alternation->AddAlternative(GuardedAlternative(on_success));

            alternation->AddAlternative(

                GuardedAlternative(body->ToNode(compiler, answer)));

          }

          answer = alternation;

          if (not_at_start && !compiler->read_backward()) {

            alternation->set_not_at_start();

          }

        }

        return answer;

      }

    }

  }

  // No unrolling took place: build an explicit loop around a LoopChoiceNode.
  bool has_min = min > 0;

  bool has_max = max < RegExpTree::kInfinity;

  // A counter register is only allocated when a bound has to be enforced.
  bool needs_counter = has_min || has_max;

  int reg_ctr = needs_counter ? compiler->AllocateRegister()

                              : RegExpCompiler::kNoRegister;

  LoopChoiceNode* center = zone->New<LoopChoiceNode>(

      body->min_match() == 0, compiler->read_backward(), min, zone);

  if (not_at_start && !compiler->read_backward()) center->set_not_at_start();

  // Node the body continues with after an iteration: either the loop centre
  // directly, or an increment of the counter that then leads to the centre.
  RegExpNode* loop_return =

      needs_counter ? static_cast<RegExpNode*>(

                          ActionNode::IncrementRegister(reg_ctr, center))

                    : static_cast<RegExpNode*>(center);

  if (body_can_be_empty) {

    // If the body can be empty we need to check if it was and then

    // backtrack.

    loop_return =

        ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return);

  }

  RegExpNode* body_node = body->ToNode(compiler, loop_return);

  if (body_can_be_empty) {

    // If the body can be empty we need to store the start position

    // so we can bail out if it was empty.

    body_node = ActionNode::StorePosition(body_start_reg, false, body_node);

  }

  if (needs_capture_clearing) {

    // Before entering the body of this loop we need to clear captures.

    body_node = ActionNode::ClearCaptures(capture_registers, body_node);

  }

  GuardedAlternative body_alt(body_node);

  if (has_max) {

    // Guard the loop alternative with "counter < max".
    Guard* body_guard = zone->New<Guard>(reg_ctr, Guard::LT, max);

    body_alt.AddGuard(body_guard, zone);

  }

  GuardedAlternative rest_alt(on_success);

  if (has_min) {

    // Guard the continue alternative with "counter >= min".
    Guard* rest_guard = compiler->zone()->New<Guard>(reg_ctr, Guard::GEQ, min);

    rest_alt.AddGuard(rest_guard, zone);

  }

  // Greedy loops register the loop alternative first; non-greedy loops
  // register the continue alternative first.
  if (is_greedy) {

    center->AddLoopAlternative(body_alt);

    center->AddContinueAlternative(rest_alt);

  } else {

    center->AddContinueAlternative(rest_alt);

    center->AddLoopAlternative(body_alt);

  }

  if (needs_counter) {

    // Enter through an action that sets the counter register to 0.
    return ActionNode::SetRegisterForLoop(reg_ctr, 0, center);

  } else {

    return center;

  }

}


}  // namespace internal

}  // namespace v8

Zone
friend Zone
Definition asm-types.cc:195

top
ThreadLocalTop * top
Definition bootstrapper.cc:5248

pos
SourcePosition pos
Definition class-debug-reader-generator.cc:34

length

unibrow::Mapping
Definition unicode.h:70

unibrow::Mapping::get
int get(uchar c, uchar n, uchar *result)
Definition unicode-inl.h:32

unibrow::Utf16::LeadSurrogate
static uint16_t LeadSurrogate(uint32_t char_code)
Definition unicode.h:126

unibrow::Utf16::TrailSurrogate
static uint16_t TrailSurrogate(uint32_t char_code)
Definition unicode.h:129

unibrow::Utf16::IsTrailSurrogate
static bool IsTrailSurrogate(int code)
Definition unicode.h:109

unibrow::Utf16::IsLeadSurrogate
static bool IsLeadSurrogate(int code)
Definition unicode.h:106

v8::base::Flags< RegExpFlag >

v8::base::SmallVector< CharacterRange, kInitialSize >

v8::base::SmallVector::size
size_t size() const
Definition small-vector.h:144

v8::base::SmallVector::emplace_back
void emplace_back(Args &&... args)
Definition small-vector.h:184

v8::base::SmallVector::at
T & at(size_t index)
Definition small-vector.h:166

v8::base::SmallVector::empty
bool empty() const
Definition small-vector.h:145

v8::base::Vector::at
const T & at(size_t index) const
Definition vector.h:81

v8::internal::ActionNode
Definition regexp-nodes.h:325

v8::internal::ActionNode::AsActionNode
ActionNode * AsActionNode() override
Definition regexp-nodes.h:360

v8::internal::ActionNode::EmptyMatchCheck
static ActionNode * EmptyMatchCheck(int start_register, int repetition_register, int repetition_limit, RegExpNode *on_success)
Definition regexp-compiler.cc:704

v8::internal::ActionNode::ModifyFlags
static ActionNode * ModifyFlags(RegExpFlags flags, RegExpNode *on_success)
Definition regexp-compiler.cc:716

v8::internal::ActionNode::StorePosition
static ActionNode * StorePosition(int reg, bool is_capture, RegExpNode *on_success)
Definition regexp-compiler.cc:654

v8::internal::ActionNode::PositiveSubmatchSuccess
static ActionNode * PositiveSubmatchSuccess(int stack_pointer_reg, int restore_reg, int clear_capture_count, int clear_capture_from, RegExpNode *on_success)
Definition regexp-compiler.cc:691

v8::internal::ActionNode::BeginNegativeSubmatch
static ActionNode * BeginNegativeSubmatch(int stack_pointer_reg, int position_reg, RegExpNode *on_success)
Definition regexp-compiler.cc:682

v8::internal::ActionNode::SetRegisterForLoop
static ActionNode * SetRegisterForLoop(int reg, int val, RegExpNode *on_success)
Definition regexp-compiler.cc:638

v8::internal::ActionNode::IncrementRegister
static ActionNode * IncrementRegister(int reg, RegExpNode *on_success)
Definition regexp-compiler.cc:647

v8::internal::ActionNode::BeginPositiveSubmatch
static ActionNode * BeginPositiveSubmatch(int stack_pointer_reg, int position_reg, RegExpNode *body, ActionNode *success_node)
Definition regexp-compiler.cc:671

v8::internal::ActionNode::ClearCaptures
static ActionNode * ClearCaptures(Interval range, RegExpNode *on_success)
Definition regexp-compiler.cc:663

v8::internal::AssertionNode::AtStart
static AssertionNode * AtStart(RegExpNode *on_success)
Definition regexp-nodes.h:497

v8::internal::AssertionNode::AtEnd
static AssertionNode * AtEnd(RegExpNode *on_success)
Definition regexp-nodes.h:494

v8::internal::AssertionNode::AfterNewline
static AssertionNode * AfterNewline(RegExpNode *on_success)
Definition regexp-nodes.h:506

v8::internal::CharacterRange
Definition regexp-ast.h:96

v8::internal::CharacterRange::Equals
static bool Equals(const ZoneList< CharacterRange > *lhs, const ZoneList< CharacterRange > *rhs)
Definition regexp-compiler-tonode.cc:1947

v8::internal::CharacterRange::Canonicalize
static void Canonicalize(ZoneList< CharacterRange > *ranges)
Definition regexp-compiler-tonode.cc:1741

v8::internal::CharacterRange::AddUnicodeCaseEquivalents
static void AddUnicodeCaseEquivalents(ZoneList< CharacterRange > *ranges, Zone *zone)
Definition regexp-compiler-tonode.cc:427

v8::internal::CharacterRange::from
base::uc32 from() const
Definition regexp-ast.h:140

v8::internal::CharacterRange::Subtract
static void Subtract(const ZoneList< CharacterRange > *src, const ZoneList< CharacterRange > *to_remove, ZoneList< CharacterRange > *dst, Zone *zone)
Definition regexp-compiler-tonode.cc:1854

v8::internal::CharacterRange::Negate
static void Negate(const ZoneList< CharacterRange > *src, ZoneList< CharacterRange > *dst, Zone *zone)
Definition regexp-compiler-tonode.cc:1777

v8::internal::CharacterRange::to
base::uc32 to() const
Definition regexp-ast.h:141

v8::internal::CharacterRange::AddCaseEquivalents
static V8_EXPORT_PRIVATE void AddCaseEquivalents(Isolate *isolate, Zone *zone, ZoneList< CharacterRange > *ranges, bool is_one_byte)
Definition regexp-compiler-tonode.cc:1518

v8::internal::CharacterRange::ClampToOneByte
static void ClampToOneByte(ZoneList< CharacterRange > *ranges)
Definition regexp-compiler-tonode.cc:1926

v8::internal::CharacterRange::Singleton
static CharacterRange Singleton(base::uc32 value)
Definition regexp-ast.h:102

v8::internal::CharacterRange::IsCanonical
static V8_EXPORT_PRIVATE bool IsCanonical(const ZoneList< CharacterRange > *ranges)
Definition regexp-compiler-tonode.cc:1636

v8::internal::CharacterRange::List
static ZoneList< CharacterRange > * List(Zone *zone, CharacterRange range)
Definition regexp-ast.h:114

v8::internal::CharacterRange::Intersect
static void Intersect(const ZoneList< CharacterRange > *lhs, const ZoneList< CharacterRange > *rhs, ZoneList< CharacterRange > *dst, Zone *zone)
Definition regexp-compiler-tonode.cc:1801

v8::internal::CharacterRange::Range
static CharacterRange Range(base::uc32 from, base::uc32 to)
Definition regexp-ast.h:105

v8::internal::CharacterRange::Everything
static CharacterRange Everything()
Definition regexp-ast.h:110

v8::internal::CharacterRange::AddClassEscape
static V8_EXPORT_PRIVATE void AddClassEscape(StandardCharacterSet standard_character_set, ZoneList< CharacterRange > *ranges, bool add_unicode_case_equivalents, Zone *zone)
Definition regexp-compiler-tonode.cc:1455

v8::internal::CharacterSet::set_standard_set_type
void set_standard_set_type(StandardCharacterSet standard_set_type)
Definition regexp-ast.h:294

v8::internal::CharacterSet::Canonicalize
V8_EXPORT_PRIVATE void Canonicalize()
Definition regexp-compiler-tonode.cc:1733

v8::internal::CharacterSet::is_standard
bool is_standard() const
Definition regexp-ast.h:297

v8::internal::CharacterSet::ranges
ZoneList< CharacterRange > * ranges(Zone *zone)
Definition regexp-compiler-tonode.cc:1649

v8::internal::ChoiceNode
Definition regexp-nodes.h:631

v8::internal::Guard::LT
@ LT
Definition regexp-nodes.h:603

v8::internal::Guard::GEQ
@ GEQ
Definition regexp-nodes.h:603

v8::internal::GuardedAlternative
Definition regexp-nodes.h:615

v8::internal::Isolate
Definition isolate.h:586

v8::internal::NegativeLookaroundChoiceNode
Definition regexp-nodes.h:696

v8::internal::NegativeSubmatchSuccess
Definition regexp-nodes.h:582

v8::internal::RegExpAlternative
Definition regexp-ast.h:240

v8::internal::RegExpAssertion::Type
Type
Definition regexp-ast.h:261

v8::internal::RegExpAssertion::Type::BOUNDARY
@ BOUNDARY

v8::internal::RegExpAtom
Definition regexp-ast.h:472

v8::internal::RegExpAtom::data
base::Vector< const base::uc16 > data() const
Definition regexp-ast.h:483

v8::internal::RegExpAtom::length
int length() const
Definition regexp-ast.h:484

v8::internal::RegExpCapture::StartRegister
static int StartRegister(int index)
Definition regexp-ast.h:621

v8::internal::RegExpCapture::ToNode
static RegExpNode * ToNode(RegExpTree *body, int index, RegExpCompiler *compiler, RegExpNode *on_success)
Definition regexp-compiler-tonode.cc:1301

v8::internal::RegExpCapture::EndRegister
static int EndRegister(int index)
Definition regexp-ast.h:622

v8::internal::RegExpClassRanges
Definition regexp-ast.h:305

v8::internal::RegExpClassRanges::contains_split_surrogate
bool contains_split_surrogate() const
Definition regexp-ast.h:356

v8::internal::RegExpClassRanges::standard_type
StandardCharacterSet standard_type() const
Definition regexp-ast.h:348

v8::internal::RegExpClassRanges::is_negated
bool is_negated() const
Definition regexp-ast.h:355

v8::internal::RegExpClassRanges::is_standard
bool is_standard(Zone *zone)
Definition regexp-compiler-tonode.cc:96

v8::internal::RegExpClassRanges::IS_CASE_FOLDED
@ IS_CASE_FOLDED
Definition regexp-ast.h:316

v8::internal::RegExpClassRanges::CONTAINS_SPLIT_SURROGATE
@ CONTAINS_SPLIT_SURROGATE
Definition regexp-ast.h:315

v8::internal::RegExpClassRanges::ranges
ZoneList< CharacterRange > * ranges(Zone *zone)
Definition regexp-ast.h:353

v8::internal::RegExpClassRanges::set_
CharacterSet set_
Definition regexp-ast.h:364

v8::internal::RegExpClassRanges::is_case_folded
bool is_case_folded() const
Definition regexp-ast.h:359

v8::internal::RegExpClassRanges::RegExpClassRanges
RegExpClassRanges(Zone *zone, ZoneList< CharacterRange > *ranges, ClassRangesFlags class_ranges_flags=ClassRangesFlags())
Definition regexp-ast.h:320

v8::internal::RegExpClassSetExpression
Definition regexp-ast.h:429

v8::internal::RegExpClassSetExpression::ComputeExpression
static RegExpClassSetOperand * ComputeExpression(RegExpTree *root, ZoneList< CharacterRange > *temp_ranges, Zone *zone)
Definition regexp-compiler-tonode.cc:628

v8::internal::RegExpClassSetExpression::OperationType::kSubtraction
@ kSubtraction

v8::internal::RegExpClassSetExpression::OperationType::kIntersection
@ kIntersection

v8::internal::RegExpClassSetExpression::OperationType::kUnion
@ kUnion

v8::internal::RegExpClassSetOperand
Definition regexp-ast.h:398

v8::internal::RegExpClassSetOperand::strings_
CharacterClassStrings * strings_
Definition regexp-ast.h:424

v8::internal::RegExpClassSetOperand::ranges
ZoneList< CharacterRange > * ranges()
Definition regexp-ast.h:416

v8::internal::RegExpClassSetOperand::Intersect
void Intersect(RegExpClassSetOperand *other, ZoneList< CharacterRange > *temp_ranges, Zone *zone)
Definition regexp-compiler-tonode.cc:589

v8::internal::RegExpClassSetOperand::strings
CharacterClassStrings * strings()
Definition regexp-ast.h:417

v8::internal::RegExpClassSetOperand::has_strings
bool has_strings() const
Definition regexp-ast.h:415

v8::internal::RegExpClassSetOperand::Union
void Union(RegExpClassSetOperand *other, Zone *zone)
Definition regexp-compiler-tonode.cc:579

v8::internal::RegExpClassSetOperand::Subtract
void Subtract(RegExpClassSetOperand *other, ZoneList< CharacterRange > *temp_ranges, Zone *zone)
Definition regexp-compiler-tonode.cc:610

v8::internal::RegExpCompiler
Definition regexp-compiler.h:454

v8::internal::RegExpCompiler::kNoRegister
static const int kNoRegister
Definition regexp-compiler.h:570

v8::internal::RegExpDisjunction
Definition regexp-ast.h:218

v8::internal::RegExpDisjunction::alternatives
ZoneList< RegExpTree * > * alternatives() const
Definition regexp-ast.h:229

v8::internal::RegExpDisjunction::RationalizeConsecutiveAtoms
void RationalizeConsecutiveAtoms(RegExpCompiler *compiler)
Definition regexp-compiler-tonode.cc:871

v8::internal::RegExpDisjunction::FixSingleCharacterDisjunctions
void FixSingleCharacterDisjunctions(RegExpCompiler *compiler)
Definition regexp-compiler-tonode.cc:984

v8::internal::RegExpDisjunction::SortConsecutiveAtoms
bool SortConsecutiveAtoms(RegExpCompiler *compiler)
Definition regexp-compiler-tonode.cc:816

v8::internal::RegExpEmpty
Definition regexp-ast.h:734

v8::internal::RegExpLookaround::Builder
Definition regexp-ast.h:683

v8::internal::RegExpLookaround::Builder::is_positive_
bool is_positive_
Definition regexp-ast.h:692

v8::internal::RegExpLookaround::Builder::on_match_success_
RegExpNode * on_match_success_
Definition regexp-ast.h:693

v8::internal::RegExpLookaround::Builder::ForMatch
RegExpNode * ForMatch(RegExpNode *match)
Definition regexp-compiler-tonode.cc:1258

v8::internal::RegExpLookaround::Builder::on_success_
RegExpNode * on_success_
Definition regexp-ast.h:694

v8::internal::RegExpLookaround::Builder::Builder
Builder(bool is_positive, RegExpNode *on_success, int stack_pointer_register, int position_register, int capture_register_count=0, int capture_register_start=0)
Definition regexp-compiler-tonode.cc:1237

v8::internal::RegExpLookaround::capture_count_
int capture_count_
Definition regexp-ast.h:702

v8::internal::RegExpLookaround::index
int index() const
Definition regexp-ast.h:681

v8::internal::RegExpLookaround::body
RegExpTree * body() const
Definition regexp-ast.h:676

v8::internal::RegExpLookaround::type
Type type() const
Definition regexp-ast.h:680

v8::internal::RegExpLookaround::LOOKBEHIND
@ LOOKBEHIND
Definition regexp-ast.h:659

v8::internal::RegExpLookaround::body_
RegExpTree * body_
Definition regexp-ast.h:700

v8::internal::RegExpLookaround::is_positive_
bool is_positive_
Definition regexp-ast.h:701

v8::internal::RegExpLookaround::is_positive
bool is_positive() const
Definition regexp-ast.h:677

v8::internal::RegExpLookaround::capture_from_
int capture_from_
Definition regexp-ast.h:703

v8::internal::RegExpNode
Definition regexp-nodes.h:133

v8::internal::RegExpNode::zone
Zone * zone() const
Definition regexp-nodes.h:271

v8::internal::RegExpQuantifier::ToNode
static RegExpNode * ToNode(int min, int max, bool is_greedy, RegExpTree *body, RegExpCompiler *compiler, RegExpNode *on_success, bool not_at_start=false)
Definition regexp-compiler-tonode.cc:1078

v8::internal::RegExpText::elements
ZoneList< TextElement > * elements()
Definition regexp-ast.h:538

v8::internal::RegExpTree
Definition regexp-ast.h:194

v8::internal::RegExpTree::ToNode
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)=0

v8::internal::RegExpTree::min_match
virtual int min_match()=0

v8::internal::RegExpTree::kInfinity
static const int kInfinity
Definition regexp-ast.h:196

v8::internal::RegExpTree::CaptureRegisters
virtual Interval CaptureRegisters()
Definition regexp-ast.h:208

v8::internal::String::kMaxOneByteCharCodeU
static const uint32_t kMaxOneByteCharCodeU
Definition string.h:501

v8::internal::String::kMaxOneByteCharCode
static const int32_t kMaxOneByteCharCode
Definition string.h:500

v8::internal::TextElement::Atom
static TextElement Atom(RegExpAtom *atom)
Definition regexp-compiler.cc:209

v8::internal::TextNode
Definition regexp-nodes.h:425

v8::internal::TextNode::CreateForSurrogatePair
static TextNode * CreateForSurrogatePair(Zone *zone, CharacterRange lead, ZoneList< CharacterRange > *trail_ranges, bool read_backward, RegExpNode *on_success)
Definition regexp-compiler.cc:2515

v8::internal::TextNode::CreateForCharacterRanges
static TextNode * CreateForCharacterRanges(Zone *zone, ZoneList< CharacterRange > *ranges, bool read_backward, RegExpNode *on_success)
Definition regexp-compiler.cc:2504

v8::internal::UnicodeRangeSplitter::lead_surrogates_
CharacterRangeVector lead_surrogates_
Definition regexp-compiler.h:614

v8::internal::UnicodeRangeSplitter::trail_surrogates_
CharacterRangeVector trail_surrogates_
Definition regexp-compiler.h:615

v8::internal::UnicodeRangeSplitter::non_bmp_
CharacterRangeVector non_bmp_
Definition regexp-compiler.h:616

v8::internal::UnicodeRangeSplitter::AddRange
void AddRange(CharacterRange range)
Definition regexp-compiler-tonode.cc:147

v8::internal::UnicodeRangeSplitter::UnicodeRangeSplitter
V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList< CharacterRange > *base)
Definition regexp-compiler-tonode.cc:134

v8::internal::UnicodeRangeSplitter::bmp_
CharacterRangeVector bmp_
Definition regexp-compiler.h:613

v8::internal::ZoneList
Definition zone-type-traits.h:18

v8::internal::ZoneList::length
V8_INLINE int length() const
Definition zone-list.h:101

v8::internal::ZoneList::at
T & at(int i) const
Definition zone-list.h:88

v8::internal::ZoneList::Rewind
V8_INLINE void Rewind(int pos)
Definition zone-list-inl.h:124

v8::internal::ZoneList::is_empty
V8_INLINE bool is_empty() const
Definition zone-list.h:100

v8::internal::ZoneList::Add
void Add(const T &element, Zone *zone)
Definition zone-list-inl.h:19

v8::internal::Zone
Definition zone.h:43

v8::internal::Zone::New
T * New(Args &&... args)
Definition zone.h:114

zone_
Zone * zone_
Definition code-generator-arm.cc:230

globals.h

is_empty
bool is_empty
Definition sweeper.cc:229

count
uint32_t count
Definition debug-coverage.cc:594

info
Handle< SharedFunctionInfo > info
Definition debug-coverage.cc:593

end
int end
Definition debug-coverage.cc:596

current
LineAndColumn current
Definition earley-parser.cc:22

isolate.h

children
std::vector< std::unique_ptr< InstanceTypeTree > > children
Definition instance-type-generator.cc:25

base
OpIndex base
Definition instruction-selector-ia32.cc:65

a
std::optional< TNode< JSArray > > a
Definition js-call-reducer.cc:1757

second
double second
Definition js-temporal-objects.cc:63

result
ZoneVector< RpoNumber > & result
Definition jump-threading.cc:21

to
Point to
Definition liveedit-diff.cc:98

n
int n
Definition mul-fft.cc:296

r
int r
Definition mul-fft.cc:298

unibrow::uchar
unsigned int uchar
Definition unicode.h:21

v8::base::uc32
uint32_t uc32
Definition strings.h:19

v8::base::uc16
uint16_t uc16
Definition strings.h:18

v8::internal::regexp_compiler_constants::kWordRangeCount
constexpr int kWordRangeCount
Definition regexp-compiler.h:35

v8::internal::regexp_compiler_constants::kDigitRanges
constexpr int kDigitRanges[]
Definition regexp-compiler.h:36

v8::internal::regexp_compiler_constants::kLineTerminatorRangeCount
constexpr int kLineTerminatorRangeCount
Definition regexp-compiler.h:43

v8::internal::regexp_compiler_constants::kRangeEndMarker
constexpr base::uc32 kRangeEndMarker
Definition regexp-compiler.h:26

v8::internal::regexp_compiler_constants::kDigitRangeCount
constexpr int kDigitRangeCount
Definition regexp-compiler.h:37

v8::internal::regexp_compiler_constants::kSpaceRangeCount
constexpr int kSpaceRangeCount
Definition regexp-compiler.h:31

v8::internal::regexp_compiler_constants::kWordRanges
constexpr int kWordRanges[]
Definition regexp-compiler.h:33

v8::internal::regexp_compiler_constants::kLineTerminatorRanges
constexpr int kLineTerminatorRanges[]
Definition regexp-compiler.h:41

v8::internal::regexp_compiler_constants::kSpaceRanges
constexpr int kSpaceRanges[]
Definition regexp-compiler.h:27

v8::internal
Definition api-arguments-inl.h:20

v8::internal::kNonBmpEnd
static const base::uc32 kNonBmpEnd
Definition regexp-macro-assembler.h:27

v8::internal::kNonBmpStart
static const base::uc32 kNonBmpStart
Definition gen-regexp-special-case.cc:18

v8::internal::kBitsPerByte
constexpr int kBitsPerByte
Definition globals.h:682

v8::internal::kTrailSurrogateStart
static const base::uc32 kTrailSurrogateStart
Definition regexp-macro-assembler.h:24

v8::internal::cc
@ cc
Definition constants-arm.h:88

v8::internal::IsEitherUnicode
constexpr bool IsEitherUnicode(RegExpFlags f)
Definition regexp-flags.h:61

v8::internal::internal
internal
Definition wasm-objects-inl.h:458

v8::internal::NeedsUnicodeCaseEquivalents
bool NeedsUnicodeCaseEquivalents(RegExpFlags flags)
Definition regexp-compiler.h:53

v8::internal::flags
Flag flags[]
Definition flags.cc:3797

v8::internal::kMaxUtf16CodeUnitU
constexpr uint32_t kMaxUtf16CodeUnitU
Definition regexp-compiler-tonode.cc:28

v8::internal::kMaxCodePoint
constexpr base::uc32 kMaxCodePoint
Definition regexp-compiler-tonode.cc:26

v8::internal::RegExpFlags
base::Flags< RegExpFlag > RegExpFlags
Definition regexp-flags.h:51

v8::internal::kTrailSurrogateEnd
static const base::uc32 kTrailSurrogateEnd
Definition regexp-macro-assembler.h:25

v8::internal::kUInt32Size
constexpr int kUInt32Size
Definition globals.h:403

v8::internal::kLeadSurrogateStart
static const base::uc32 kLeadSurrogateStart
Definition regexp-macro-assembler.h:22

v8::internal::UNREACHABLE
UNREACHABLE()

v8::internal::kLeadSurrogateEnd
static const base::uc32 kLeadSurrogateEnd
Definition regexp-macro-assembler.h:23

v8::internal::StandardCharacterSet
StandardCharacterSet
Definition regexp-ast.h:82

v8::internal::StandardCharacterSet::kNotLineTerminator
@ kNotLineTerminator

v8::internal::StandardCharacterSet::kNotWord
@ kNotWord

v8::internal::StandardCharacterSet::kWord
@ kWord

v8::internal::StandardCharacterSet::kLineTerminator
@ kLineTerminator

v8::internal::StandardCharacterSet::kWhitespace
@ kWhitespace

v8::internal::StandardCharacterSet::kEverything
@ kEverything

v8::internal::StandardCharacterSet::kNotWhitespace
@ kNotWhitespace

v8::internal::StandardCharacterSet::kDigit
@ kDigit

v8::internal::StandardCharacterSet::kNotDigit
@ kNotDigit

v8::internal::kMaxUtf16CodeUnit
constexpr int kMaxUtf16CodeUnit
Definition regexp-compiler-tonode.cc:27

v8::internal::RangeContainsLatin1Equivalents
bool RangeContainsLatin1Equivalents(CharacterRange range)
Definition regexp-compiler.cc:1924

v8::internal::length
size_t length
Definition external-reference.cc:1491

v8
Definition api-arguments-inl.h:19

ok_to_expand_
bool ok_to_expand_
Definition regexp-compiler-tonode.cc:1994

previous_flags_
const RegExpFlags previous_flags_
Definition regexp-compiler-tonode.cc:1213

terms_
ZoneList< RegExpTree * > * terms_
Definition regexp-compiler-tonode.cc:1399

compiler_
RegExpCompiler * compiler_
Definition regexp-compiler-tonode.cc:1212

kMaxExpansionFactor
static const int kMaxExpansionFactor
Definition regexp-compiler-tonode.cc:1966

saved_expansion_factor_
int saved_expansion_factor_
Definition regexp-compiler-tonode.cc:1993

regexp-compiler.h

regexp.h

size
int size
Definition setup-heap-internal.cc:131

special-case.h

DCHECK_LE
#define DCHECK_LE(v1, v2)
Definition logging.h:490

CHECK
#define CHECK(condition)
Definition logging.h:124

DCHECK_NOT_NULL
#define DCHECK_NOT_NULL(val)
Definition logging.h:492

DCHECK_IMPLIES
#define DCHECK_IMPLIES(v1, v2)
Definition logging.h:493

DCHECK_NE
#define DCHECK_NE(v1, v2)
Definition logging.h:486

DCHECK
#define DCHECK(condition)
Definition logging.h:482

DCHECK_LT
#define DCHECK_LT(v1, v2)
Definition logging.h:489

DCHECK_EQ
#define DCHECK_EQ(v1, v2)
Definition logging.h:485

DCHECK_GT
#define DCHECK_GT(v1, v2)
Definition logging.h:487

arraysize
#define arraysize(array)
Definition macros.h:67

DISALLOW_IMPLICIT_CONSTRUCTORS
#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName)
Definition macros.h:130

string.h

strings.h

unibrow::Ecma262Canonicalize::kMaxWidth
static const int kMaxWidth
Definition unicode.h:294

unibrow::Ecma262UnCanonicalize::kMaxWidth
static const int kMaxWidth
Definition unicode.h:298

v8::internal::count
Definition v8-fast-api-calls.h:511

unicode-inl.h

V8_NODISCARD
#define V8_NODISCARD
Definition v8config.h:693

body_
const wasm::FunctionBody & body_
Definition wasm-inlining-into-js.cc:370

zone-list-inl.h