v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
regexp-compiler-tonode.cc
Go to the documentation of this file.
1// Copyright 2019 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
9#include "src/regexp/regexp.h"
12
13#ifdef V8_INTL_SUPPORT
14#include "src/base/strings.h"
16#include "unicode/locid.h"
17#include "unicode/uniset.h"
18#include "unicode/utypes.h"
19#endif // V8_INTL_SUPPORT
20
21namespace v8 {
22namespace internal {
23
24using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
25
26constexpr base::uc32 kMaxCodePoint = 0x10ffff;
27constexpr int kMaxUtf16CodeUnit = 0xffff;
28constexpr uint32_t kMaxUtf16CodeUnitU = 0xffff;
29
30// -------------------------------------------------------------------
31// Tree to graph conversion
32
34 RegExpNode* on_success) {
36 compiler->zone()->New<ZoneList<TextElement>>(1, compiler->zone());
37 elms->Add(TextElement::Atom(this), compiler->zone());
38 return compiler->zone()->New<TextNode>(elms, compiler->read_backward(),
39 on_success);
40}
41
42RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
43 RegExpNode* on_success) {
44 return compiler->zone()->New<TextNode>(elements(), compiler->read_backward(),
45 on_success);
46}
47
48namespace {
49
50bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
51 const int* special_class, int length) {
52 length--; // Remove final marker.
53
54 DCHECK_EQ(kRangeEndMarker, special_class[length]);
55 DCHECK_NE(0, ranges->length());
56 DCHECK_NE(0, length);
57 DCHECK_NE(0, special_class[0]);
58
59 if (ranges->length() != (length >> 1) + 1) return false;
60
61 CharacterRange range = ranges->at(0);
62 if (range.from() != 0) return false;
63
64 for (int i = 0; i < length; i += 2) {
65 if (static_cast<base::uc32>(special_class[i]) != (range.to() + 1)) {
66 return false;
67 }
68 range = ranges->at((i >> 1) + 1);
69 if (static_cast<base::uc32>(special_class[i + 1]) != range.from()) {
70 return false;
71 }
72 }
73
74 return range.to() == kMaxCodePoint;
75}
76
77bool CompareRanges(ZoneList<CharacterRange>* ranges, const int* special_class,
78 int length) {
79 length--; // Remove final marker.
80
81 DCHECK_EQ(kRangeEndMarker, special_class[length]);
82 if (ranges->length() * 2 != length) return false;
83
84 for (int i = 0; i < length; i += 2) {
85 CharacterRange range = ranges->at(i >> 1);
86 if (range.from() != static_cast<base::uc32>(special_class[i]) ||
87 range.to() != static_cast<base::uc32>(special_class[i + 1] - 1)) {
88 return false;
89 }
90 }
91 return true;
92}
93
94} // namespace
95
97 // TODO(lrn): Remove need for this function, by not throwing away information
98 // along the way.
99 if (is_negated()) {
100 return false;
101 }
102 if (set_.is_standard()) {
103 return true;
104 }
105 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
107 return true;
108 }
109 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
111 return true;
112 }
113 if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges,
116 return true;
117 }
118 if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges,
121 return true;
122 }
123 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
125 return true;
126 }
127 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
129 return true;
130 }
131 return false;
132}
133
135 // The unicode range splitter categorizes given character ranges into:
136 // - Code points from the BMP representable by one code unit.
137 // - Code points outside the BMP that need to be split into
138 // surrogate pairs.
139 // - Lone lead surrogates.
140 // - Lone trail surrogates.
141 // Lone surrogates are valid code points, even though no actual characters.
142 // They require special matching to make sure we do not split surrogate pairs.
143
144 for (int i = 0; i < base->length(); i++) AddRange(base->at(i));
145}
146
148 static constexpr base::uc32 kBmp1Start = 0;
149 static constexpr base::uc32 kBmp1End = kLeadSurrogateStart - 1;
150 static constexpr base::uc32 kBmp2Start = kTrailSurrogateEnd + 1;
151 static constexpr base::uc32 kBmp2End = kNonBmpStart - 1;
152
153 // Ends are all inclusive.
154 static_assert(kBmp1Start == 0);
155 static_assert(kBmp1Start < kBmp1End);
156 static_assert(kBmp1End + 1 == kLeadSurrogateStart);
157 static_assert(kLeadSurrogateStart < kLeadSurrogateEnd);
158 static_assert(kLeadSurrogateEnd + 1 == kTrailSurrogateStart);
160 static_assert(kTrailSurrogateEnd + 1 == kBmp2Start);
161 static_assert(kBmp2Start < kBmp2End);
162 static_assert(kBmp2End + 1 == kNonBmpStart);
163 static_assert(kNonBmpStart < kNonBmpEnd);
164
165 static constexpr base::uc32 kStarts[] = {
167 kBmp2Start, kNonBmpStart,
168 };
169
170 static constexpr base::uc32 kEnds[] = {
171 kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd,
172 };
173
174 CharacterRangeVector* const kTargets[] = {
176 };
177
178 static constexpr int kCount = arraysize(kStarts);
179 static_assert(kCount == arraysize(kEnds));
180 static_assert(kCount == arraysize(kTargets));
181
182 for (int i = 0; i < kCount; i++) {
183 if (kStarts[i] > range.to()) break;
184 const base::uc32 from = std::max(kStarts[i], range.from());
185 const base::uc32 to = std::min(kEnds[i], range.to());
186 if (from > to) continue;
187 kTargets[i]->emplace_back(CharacterRange::Range(from, to));
188 }
189}
190
191namespace {
192
193// Translates between new and old V8-isms (SmallVector, ZoneList).
194ZoneList<CharacterRange>* ToCanonicalZoneList(
196 if (v->empty()) return nullptr;
197
199 zone->New<ZoneList<CharacterRange>>(static_cast<int>(v->size()), zone);
200 for (size_t i = 0; i < v->size(); i++) {
201 result->Add(v->at(i), zone);
202 }
203
205 return result;
206}
207
208void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
209 RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
210 ZoneList<CharacterRange>* bmp =
211 ToCanonicalZoneList(splitter->bmp(), compiler->zone());
212 if (bmp == nullptr) return;
213 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
214 compiler->zone(), bmp, compiler->read_backward(), on_success)));
215}
216
217using UC16Range = uint32_t; // {from, to} packed into one uint32_t.
218constexpr UC16Range ToUC16Range(base::uc16 from, base::uc16 to) {
219 return (static_cast<uint32_t>(from) << 16) | to;
220}
221constexpr base::uc16 ExtractFrom(UC16Range r) {
222 return static_cast<base::uc16>(r >> 16);
223}
224constexpr base::uc16 ExtractTo(UC16Range r) {
225 return static_cast<base::uc16>(r);
226}
227
228void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
229 RegExpNode* on_success,
230 UnicodeRangeSplitter* splitter) {
231 DCHECK(!compiler->one_byte());
232 Zone* const zone = compiler->zone();
233 ZoneList<CharacterRange>* non_bmp =
234 ToCanonicalZoneList(splitter->non_bmp(), zone);
235 if (non_bmp == nullptr) return;
236
237 // Translate each 32-bit code point range into the corresponding 16-bit code
238 // unit representation consisting of the lead- and trail surrogate.
239 //
240 // The generated alternatives are grouped by the leading surrogate to avoid
241 // emitting excessive code. For example, for
242 //
243 // { \ud800[\udc00-\udc01]
244 // , \ud800[\udc05-\udc06]
245 // }
246 //
247 // there's no need to emit matching code for the leading surrogate \ud800
248 // twice. We also create a dedicated grouping for full trailing ranges, i.e.
249 // [dc00-dfff].
250 ZoneUnorderedMap<UC16Range, ZoneList<CharacterRange>*> grouped_by_leading(
251 zone);
252 ZoneList<CharacterRange>* leading_with_full_trailing_range =
253 zone->New<ZoneList<CharacterRange>>(1, zone);
254 const auto AddRange = [&](base::uc16 from_l, base::uc16 to_l,
255 base::uc16 from_t, base::uc16 to_t) {
256 const UC16Range leading_range = ToUC16Range(from_l, to_l);
257 if (grouped_by_leading.count(leading_range) == 0) {
258 if (from_t == kTrailSurrogateStart && to_t == kTrailSurrogateEnd) {
259 leading_with_full_trailing_range->Add(
260 CharacterRange::Range(from_l, to_l), zone);
261 return;
262 }
263 grouped_by_leading[leading_range] =
264 zone->New<ZoneList<CharacterRange>>(2, zone);
265 }
266 grouped_by_leading[leading_range]->Add(CharacterRange::Range(from_t, to_t),
267 zone);
268 };
269
270 // First, create the grouped ranges.
272 for (int i = 0; i < non_bmp->length(); i++) {
273 // Match surrogate pair.
274 // E.g. [\u10005-\u11005] becomes
275 // \ud800[\udc05-\udfff]|
276 // [\ud801-\ud803][\udc00-\udfff]|
277 // \ud804[\udc00-\udc05]
278 base::uc32 from = non_bmp->at(i).from();
279 base::uc32 to = non_bmp->at(i).to();
284
285 if (from_l == to_l) {
286 // The lead surrogate is the same.
287 AddRange(from_l, to_l, from_t, to_t);
288 continue;
289 }
290
291 if (from_t != kTrailSurrogateStart) {
292 // Add [from_l][from_t-\udfff].
293 AddRange(from_l, from_l, from_t, kTrailSurrogateEnd);
294 from_l++;
295 }
296 if (to_t != kTrailSurrogateEnd) {
297 // Add [to_l][\udc00-to_t].
298 AddRange(to_l, to_l, kTrailSurrogateStart, to_t);
299 to_l--;
300 }
301 if (from_l <= to_l) {
302 // Add [from_l-to_l][\udc00-\udfff].
303 AddRange(from_l, to_l, kTrailSurrogateStart, kTrailSurrogateEnd);
304 }
305 }
306
307 // Create the actual TextNode now that ranges are fully grouped.
308 if (!leading_with_full_trailing_range->is_empty()) {
309 CharacterRange::Canonicalize(leading_with_full_trailing_range);
310 result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
311 zone, leading_with_full_trailing_range,
313 compiler->read_backward(), on_success)));
314 }
315 for (const auto& it : grouped_by_leading) {
316 CharacterRange leading_range =
317 CharacterRange::Range(ExtractFrom(it.first), ExtractTo(it.first));
318 ZoneList<CharacterRange>* trailing_ranges = it.second;
319 CharacterRange::Canonicalize(trailing_ranges);
320 result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair(
321 zone, leading_range, trailing_ranges, compiler->read_backward(),
322 on_success)));
323 }
324}
325
326RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
327 RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
328 ZoneList<CharacterRange>* match, RegExpNode* on_success,
329 bool read_backward) {
330 Zone* zone = compiler->zone();
331 RegExpNode* match_node = TextNode::CreateForCharacterRanges(
332 zone, match, read_backward, on_success);
333 int stack_register = compiler->UnicodeLookaroundStackRegister();
334 int position_register = compiler->UnicodeLookaroundPositionRegister();
335 RegExpLookaround::Builder lookaround(false, match_node, stack_register,
336 position_register);
337 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
338 zone, lookbehind, !read_backward, lookaround.on_match_success());
339 return lookaround.ForMatch(negative_match);
340}
341
342RegExpNode* MatchAndNegativeLookaroundInReadDirection(
343 RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
344 ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
345 bool read_backward) {
346 Zone* zone = compiler->zone();
347 int stack_register = compiler->UnicodeLookaroundStackRegister();
348 int position_register = compiler->UnicodeLookaroundPositionRegister();
349 RegExpLookaround::Builder lookaround(false, on_success, stack_register,
350 position_register);
351 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
352 zone, lookahead, read_backward, lookaround.on_match_success());
354 zone, match, read_backward, lookaround.ForMatch(negative_match));
355}
356
357void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
358 RegExpNode* on_success,
359 UnicodeRangeSplitter* splitter) {
360 ZoneList<CharacterRange>* lead_surrogates =
361 ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone());
362 if (lead_surrogates == nullptr) return;
363 Zone* zone = compiler->zone();
364 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
365 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
367
368 RegExpNode* match;
369 if (compiler->read_backward()) {
370 // Reading backward. Assert that reading forward, there is no trail
371 // surrogate, and then backward match the lead surrogate.
372 match = NegativeLookaroundAgainstReadDirectionAndMatch(
373 compiler, trail_surrogates, lead_surrogates, on_success, true);
374 } else {
375 // Reading forward. Forward match the lead surrogate and assert that
376 // no trail surrogate follows.
377 match = MatchAndNegativeLookaroundInReadDirection(
378 compiler, lead_surrogates, trail_surrogates, on_success, false);
379 }
380 result->AddAlternative(GuardedAlternative(match));
381}
382
383void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
384 RegExpNode* on_success,
385 UnicodeRangeSplitter* splitter) {
386 ZoneList<CharacterRange>* trail_surrogates =
387 ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone());
388 if (trail_surrogates == nullptr) return;
389 Zone* zone = compiler->zone();
390 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
391 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
393
394 RegExpNode* match;
395 if (compiler->read_backward()) {
396 // Reading backward. Backward match the trail surrogate and assert that no
397 // lead surrogate precedes it.
398 match = MatchAndNegativeLookaroundInReadDirection(
399 compiler, trail_surrogates, lead_surrogates, on_success, true);
400 } else {
401 // Reading forward. Assert that reading backward, there is no lead
402 // surrogate, and then forward match the trail surrogate.
403 match = NegativeLookaroundAgainstReadDirectionAndMatch(
404 compiler, lead_surrogates, trail_surrogates, on_success, false);
405 }
406 result->AddAlternative(GuardedAlternative(match));
407}
408
409RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
410 RegExpNode* on_success) {
411 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
412 DCHECK(!compiler->read_backward());
413 Zone* zone = compiler->zone();
414 // Advance any character. If the character happens to be a lead surrogate and
415 // we advanced into the middle of a surrogate pair, it will work out, as
416 // nothing will match from there. We will have to advance again, consuming
417 // the associated trail surrogate.
418 ZoneList<CharacterRange>* range =
420 return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
421}
422
423} // namespace
424
425// static
426// Only for /ui and /vi, not for /i regexps.
428 Zone* zone) {
429#ifdef V8_INTL_SUPPORT
430 DCHECK(IsCanonical(ranges));
431
432 // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
433 // See also https://crbug.com/v8/6727.
434 // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
435 // which we use frequently internally. But large ranges can also easily be
436 // created by the user. We might want to have a more general caching mechanism
437 // for such ranges.
438 if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
439
440 // Use ICU to compute the case fold closure over the ranges.
441 icu::UnicodeSet set;
442 for (int i = 0; i < ranges->length(); i++) {
443 set.add(ranges->at(i).from(), ranges->at(i).to());
444 }
445 // Clear the ranges list without freeing the backing store.
446 ranges->Rewind(0);
447 set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
448 for (int i = 0; i < set.getRangeCount(); i++) {
449 ranges->Add(Range(set.getRangeStart(i), set.getRangeEnd(i)), zone);
450 }
451 // No errors and everything we collected have been ranges.
452 Canonicalize(ranges);
453#endif // V8_INTL_SUPPORT
454}
455
457 RegExpNode* on_success) {
459 Zone* const zone = compiler->zone();
460 ZoneList<CharacterRange>* ranges = this->ranges(zone);
461
462 const bool needs_case_folding =
463 NeedsUnicodeCaseEquivalents(compiler->flags()) && !is_case_folded();
464 if (needs_case_folding) {
466 }
467
468 if (!IsEitherUnicode(compiler->flags()) || compiler->one_byte() ||
470 return zone->New<TextNode>(this, compiler->read_backward(), on_success);
471 }
472
473 if (is_negated()) {
474 // With /v, character classes are never negated.
475 // https://tc39.es/ecma262/#sec-compileatom
476 // Atom :: CharacterClass
477 // 4. Assert: cc.[[Invert]] is false.
478 // Instead the complement is created when evaluating the class set.
479 // The only exception is the "nothing range" (negated everything), which is
480 // internally created for an empty set.
482 IsUnicodeSets(compiler->flags()),
483 ranges->length() == 1 && ranges->first().IsEverything(kMaxCodePoint));
484 ZoneList<CharacterRange>* negated =
485 zone->New<ZoneList<CharacterRange>>(2, zone);
486 CharacterRange::Negate(ranges, negated, zone);
487 ranges = negated;
488 }
489
490 if (ranges->length() == 0) {
491 // The empty character class is used as a 'fail' node.
492 RegExpClassRanges* fail = zone->New<RegExpClassRanges>(zone, ranges);
493 return zone->New<TextNode>(fail, compiler->read_backward(), on_success);
494 }
495
496 if (set_.is_standard() &&
498 return UnanchoredAdvance(compiler, on_success);
499 }
500
501 // Split ranges in order to handle surrogates correctly:
502 // - Surrogate pairs: translate the 32-bit code point into two uc16 code
503 // units (irregexp operates only on code units).
504 // - Lone surrogates: these require lookarounds to ensure we don't match in
505 // the middle of a surrogate pair.
506 ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
507 UnicodeRangeSplitter splitter(ranges);
508 AddBmpCharacters(compiler, result, on_success, &splitter);
509 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
510 AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
511 AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
512
513 static constexpr int kMaxRangesToInline = 32; // Arbitrary.
514 if (ranges->length() > kMaxRangesToInline) result->SetDoNotInline();
515
516 return result;
517}
518
519RegExpNode* RegExpClassSetOperand::ToNode(RegExpCompiler* compiler,
520 RegExpNode* on_success) {
521 Zone* zone = compiler->zone();
522 const int size = (has_strings() ? static_cast<int>(strings()->size()) : 0) +
523 (ranges()->is_empty() ? 0 : 1);
524 if (size == 0) {
525 // If neither ranges nor strings are present, the operand is equal to an
526 // empty range (matching nothing).
527 ZoneList<CharacterRange>* empty =
528 zone->template New<ZoneList<CharacterRange>>(0, zone);
529 return zone->template New<RegExpClassRanges>(zone, empty)
530 ->ToNode(compiler, on_success);
531 }
532 ZoneList<RegExpTree*>* alternatives =
533 zone->template New<ZoneList<RegExpTree*>>(size, zone);
534 // Strings are sorted by length first (larger strings before shorter ones).
535 // See the comment on CharacterClassStrings.
536 // Empty strings (if present) are added after character ranges.
537 RegExpTree* empty_string = nullptr;
538 if (has_strings()) {
539 for (auto string : *strings()) {
540 if (string.second->IsEmpty()) {
541 empty_string = string.second;
542 } else {
543 alternatives->Add(string.second, zone);
544 }
545 }
546 }
547 if (!ranges()->is_empty()) {
548 // In unicode sets mode case folding has to be done at precise locations
549 // (e.g. before building complements).
550 // It is therefore the parsers responsibility to case fold (sub-) ranges
551 // before creating ClassSetOperands.
552 alternatives->Add(zone->template New<RegExpClassRanges>(
553 zone, ranges(), RegExpClassRanges::IS_CASE_FOLDED),
554 zone);
555 }
556 if (empty_string != nullptr) {
557 alternatives->Add(empty_string, zone);
558 }
559
560 RegExpTree* node = nullptr;
561 if (size == 1) {
562 DCHECK_EQ(alternatives->length(), 1);
563 node = alternatives->first();
564 } else {
565 node = zone->template New<RegExpDisjunction>(alternatives);
566 }
567 return node->ToNode(compiler, on_success);
568}
569
570RegExpNode* RegExpClassSetExpression::ToNode(RegExpCompiler* compiler,
571 RegExpNode* on_success) {
572 Zone* zone = compiler->zone();
573 ZoneList<CharacterRange>* temp_ranges =
574 zone->template New<ZoneList<CharacterRange>>(4, zone);
575 RegExpClassSetOperand* root = ComputeExpression(this, temp_ranges, zone);
576 return root->ToNode(compiler, on_success);
577}
578
580 ranges()->AddAll(*other->ranges(), zone);
581 if (other->has_strings()) {
582 if (strings_ == nullptr) {
583 strings_ = zone->template New<CharacterClassStrings>(zone);
584 }
585 strings()->insert(other->strings()->begin(), other->strings()->end());
586 }
587}
588
590 ZoneList<CharacterRange>* temp_ranges,
591 Zone* zone) {
592 CharacterRange::Intersect(ranges(), other->ranges(), temp_ranges, zone);
593 std::swap(*ranges(), *temp_ranges);
594 temp_ranges->Rewind(0);
595 if (has_strings()) {
596 if (!other->has_strings()) {
597 strings()->clear();
598 } else {
599 for (auto iter = strings()->begin(); iter != strings()->end();) {
600 if (other->strings()->find(iter->first) == other->strings()->end()) {
601 iter = strings()->erase(iter);
602 } else {
603 iter++;
604 }
605 }
606 }
607 }
608}
609
611 ZoneList<CharacterRange>* temp_ranges,
612 Zone* zone) {
613 CharacterRange::Subtract(ranges(), other->ranges(), temp_ranges, zone);
614 std::swap(*ranges(), *temp_ranges);
615 temp_ranges->Rewind(0);
616 if (has_strings() && other->has_strings()) {
617 for (auto iter = strings()->begin(); iter != strings()->end();) {
618 if (other->strings()->find(iter->first) != other->strings()->end()) {
619 iter = strings()->erase(iter);
620 } else {
621 iter++;
622 }
623 }
624 }
625}
626
627// static
629 RegExpTree* root, ZoneList<CharacterRange>* temp_ranges, Zone* zone) {
630 DCHECK(temp_ranges->is_empty());
631 if (root->IsClassSetOperand()) {
632 return root->AsClassSetOperand();
633 }
634 DCHECK(root->IsClassSetExpression());
635 RegExpClassSetExpression* node = root->AsClassSetExpression();
637 ComputeExpression(node->operands()->at(0), temp_ranges, zone);
638 switch (node->operation()) {
640 for (int i = 1; i < node->operands()->length(); i++) {
642 ComputeExpression(node->operands()->at(i), temp_ranges, zone);
643 result->Union(op, zone);
644 }
646 break;
647 }
649 for (int i = 1; i < node->operands()->length(); i++) {
651 ComputeExpression(node->operands()->at(i), temp_ranges, zone);
652 result->Intersect(op, temp_ranges, zone);
653 }
654 break;
655 }
657 for (int i = 1; i < node->operands()->length(); i++) {
659 ComputeExpression(node->operands()->at(i), temp_ranges, zone);
660 result->Subtract(op, temp_ranges, zone);
661 }
662 break;
663 }
664 }
665 if (node->is_negated()) {
666 DCHECK(!result->has_strings());
667 CharacterRange::Negate(result->ranges(), temp_ranges, zone);
668 std::swap(*result->ranges(), *temp_ranges);
669 temp_ranges->Rewind(0);
670 node->is_negated_ = false;
671 }
672 // Store the result as single operand of the current node.
673 node->operands()->Set(0, result);
674 node->operands()->Rewind(1);
675
676 return result;
677}
678
679namespace {
680
681int CompareCharAt(RegExpAtom* a, int index_a, RegExpAtom* b, int index_b) {
682 base::uc16 character1 = a->data().at(index_a);
683 base::uc16 character2 = b->data().at(index_b);
684 if (character1 < character2) return -1;
685 if (character1 > character2) return 1;
686 return 0;
687}
688
689int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
690 RegExpAtom* atom1 = (*a)->AsAtom();
691 RegExpAtom* atom2 = (*b)->AsAtom();
692 return CompareCharAt(atom1, 0, atom2, 0);
693}
694
695int CompareLastChar(RegExpTree* const* a, RegExpTree* const* b) {
696 RegExpAtom* atom1 = (*a)->AsAtom();
697 RegExpAtom* atom2 = (*b)->AsAtom();
698 return CompareCharAt(atom1, atom1->length() - 1, atom2, atom2->length() - 1);
699}
700
701#ifdef V8_INTL_SUPPORT
702
703int CompareCaseInsensitive(const icu::UnicodeString& a,
704 const icu::UnicodeString& b) {
705 return a.caseCompare(b, U_FOLD_CASE_DEFAULT);
706}
707
708int CompareCharAtCaseInsensitive(RegExpAtom* a, int index_a, RegExpAtom* b,
709 int index_b) {
710 base::uc16 character1 = a->data().at(index_a);
711 base::uc16 character2 = b->data().at(index_b);
712 return CompareCaseInsensitive(icu::UnicodeString{character1},
713 icu::UnicodeString{character2});
714}
715
716int CompareFirstCharCaseInsensitive(RegExpTree* const* a,
717 RegExpTree* const* b) {
718 RegExpAtom* atom1 = (*a)->AsAtom();
719 RegExpAtom* atom2 = (*b)->AsAtom();
720 return CompareCharAtCaseInsensitive(atom1, 0, atom2, 0);
721}
722
723int CompareLastCharCaseInsensitive(RegExpTree* const* a, RegExpTree* const* b) {
724 RegExpAtom* atom1 = (*a)->AsAtom();
725 RegExpAtom* atom2 = (*b)->AsAtom();
726 return CompareCharAtCaseInsensitive(atom1, atom1->length() - 1, atom2,
727 atom2->length() - 1);
728}
729
730bool Equals(bool ignore_case, const icu::UnicodeString& a,
731 const icu::UnicodeString& b) {
732 if (a == b) return true;
733 if (ignore_case) return CompareCaseInsensitive(a, b) == 0;
734 return false; // Case-sensitive equality already checked above.
735}
736
737bool CharAtEquals(bool ignore_case, const RegExpAtom* a, int index_a,
738 const RegExpAtom* b, int index_b) {
739 return Equals(ignore_case, a->data().at(index_a), b->data().at(index_b));
740}
741
742#else
743
744unibrow::uchar Canonical(
746 unibrow::uchar c) {
748 int length = canonicalize->get(c, '\0', chars);
749 DCHECK_LE(length, 1);
750 unibrow::uchar canonical = c;
751 if (length == 1) canonical = chars[0];
752 return canonical;
753}
754
755int CompareCaseInsensitive(
758 if (a == b) return 0;
759 if (a >= 'a' || b >= 'a') {
760 a = Canonical(canonicalize, a);
761 b = Canonical(canonicalize, b);
762 }
763 return static_cast<int>(a) - static_cast<int>(b);
764}
765
766int CompareCharAtCaseInsensitive(
767 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, RegExpAtom* a,
768 int index_a, RegExpAtom* b, int index_b) {
769 base::uc16 character1 = a->data().at(index_a);
770 base::uc16 character2 = b->data().at(index_b);
771 return CompareCaseInsensitive(canonicalize, character1, character2);
772}
773int CompareFirstCharCaseInsensitive(
775 RegExpTree* const* a, RegExpTree* const* b) {
776 RegExpAtom* atom1 = (*a)->AsAtom();
777 RegExpAtom* atom2 = (*b)->AsAtom();
778 return CompareCharAtCaseInsensitive(canonicalize, atom1, 0, atom2, 0);
779}
780
781int CompareLastCharCaseInsensitive(
783 RegExpTree* const* a, RegExpTree* const* b) {
784 RegExpAtom* atom1 = (*a)->AsAtom();
785 RegExpAtom* atom2 = (*b)->AsAtom();
786 return CompareCharAtCaseInsensitive(canonicalize, atom1, atom1->length() - 1,
787 atom2, atom2->length() - 1);
788}
789
790bool Equals(bool ignore_case,
793 if (a == b) return true;
794 if (ignore_case) {
795 return CompareCaseInsensitive(canonicalize, a, b) == 0;
796 }
797 return false; // Case-sensitive equality already checked above.
798}
799
800bool CharAtEquals(bool ignore_case,
802 const RegExpAtom* a, int index_a, const RegExpAtom* b,
803 int index_b) {
804 return Equals(ignore_case, canonicalize, a->data().at(index_a),
805 b->data().at(index_b));
806}
807
808#endif // V8_INTL_SUPPORT
809
810} // namespace
811
812// We can stable sort runs of atoms, since the order does not matter if they
813// start with different characters when reading forwards, or end with different
814// characters when reading backwards.
815// Returns true if any consecutive atoms were found.
818 int length = alternatives->length();
819 bool found_consecutive_atoms = false;
820 for (int i = 0; i < length; i++) {
821 while (i < length) {
822 RegExpTree* alternative = alternatives->at(i);
823 if (alternative->IsAtom()) break;
824 i++;
825 }
826 // i is length or it is the index of an atom.
827 if (i == length) break;
828 int first_atom = i;
829 i++;
830 while (i < length) {
831 RegExpTree* alternative = alternatives->at(i);
832 if (!alternative->IsAtom()) break;
833 i++;
834 }
835 // Sort atoms to get ones with common prefixes together.
836 // This step is more tricky if we are in a case-independent regexp,
837 // because it would change /is|I/ to /I|is/, and order matters when
838 // the regexp parts don't match only disjoint starting points. To fix
839 // this we have a version of CompareFirstChar that uses case-
840 // independent character classes for comparison.
841 DCHECK_LT(first_atom, alternatives->length());
843 DCHECK_LE(first_atom, i);
844 const bool backwards = compiler->read_backward();
845 if (IsIgnoreCase(compiler->flags())) {
846#ifdef V8_INTL_SUPPORT
847 auto cmp_fun = backwards ? CompareLastCharCaseInsensitive
848 : CompareFirstCharCaseInsensitive;
849 alternatives->StableSort(cmp_fun, first_atom, i - first_atom);
850#else
852 compiler->isolate()->regexp_macro_assembler_canonicalize();
853 auto compare_closure = [canonicalize, backwards](RegExpTree* const* a,
854 RegExpTree* const* b) {
855 return backwards ? CompareLastCharCaseInsensitive(canonicalize, a, b)
856 : CompareFirstCharCaseInsensitive(canonicalize, a, b);
857 };
858 alternatives->StableSort(compare_closure, first_atom, i - first_atom);
859#endif // V8_INTL_SUPPORT
860 } else {
861 auto cmp_fun = backwards ? CompareLastChar : CompareFirstChar;
862 alternatives->StableSort(cmp_fun, first_atom, i - first_atom);
863 }
864 if (i - first_atom > 1) found_consecutive_atoms = true;
865 }
866 return found_consecutive_atoms;
867}
868
869// Optimizes a common prefix when reading forwards, or suffix when reading
870// backwards. E.g. turns ab|ac|ad into a(?:b|c|d).
872 Zone* zone = compiler->zone();
873 const bool backwards = compiler->read_backward();
875 int length = alternatives->length();
876 const bool ignore_case = IsIgnoreCase(compiler->flags());
877
878 int write_posn = 0;
879 int i = 0;
880 while (i < length) {
881 RegExpTree* alternative = alternatives->at(i);
882 if (!alternative->IsAtom()) {
883 alternatives->at(write_posn++) = alternatives->at(i);
884 i++;
885 continue;
886 }
887 RegExpAtom* const atom = alternative->AsAtom();
888#ifdef V8_INTL_SUPPORT
889 icu::UnicodeString common_affix(
890 atom->data().at(backwards ? atom->length() - 1 : 0));
891#else
893 compiler->isolate()->regexp_macro_assembler_canonicalize();
894 unibrow::uchar common_affix =
895 atom->data().at(backwards ? atom->length() - 1 : 0);
896 if (ignore_case) {
897 common_affix = Canonical(canonicalize, common_affix);
898 }
899#endif // V8_INTL_SUPPORT
900 int first_with_affix = i;
901 int affix_length = atom->length();
902 i++;
903 while (i < length) {
904 alternative = alternatives->at(i);
905 if (!alternative->IsAtom()) break;
906 RegExpAtom* const alt_atom = alternative->AsAtom();
907#ifdef V8_INTL_SUPPORT
908 icu::UnicodeString new_affix(
909 alt_atom->data().at(backwards ? alt_atom->length() - 1 : 0));
910 if (!Equals(ignore_case, new_affix, common_affix)) break;
911#else
912 unibrow::uchar new_affix =
913 alt_atom->data().at(backwards ? alt_atom->length() - 1 : 0);
914 if (!Equals(ignore_case, canonicalize, new_affix, common_affix)) break;
915#endif // V8_INTL_SUPPORT
916 affix_length = std::min(affix_length, alt_atom->length());
917 i++;
918 }
919 if (i > first_with_affix + 2) {
920 // Found worthwhile run of alternatives with common affix of at least one
921 // character. The sorting function above did not sort on more than one
922 // character for reasons of correctness, but there may still be a longer
923 // common affix if the terms were similar or presorted in the input.
924 // Find out how long the common affix is.
925 int run_length = i - first_with_affix;
926 RegExpAtom* const alt_atom = alternatives->at(first_with_affix)->AsAtom();
927 for (int j = 1; j < run_length && affix_length > 1; j++) {
928 RegExpAtom* old_atom = alternatives->at(j + first_with_affix)->AsAtom();
929 for (int k = 1; k < affix_length; k++) {
930 const int alt_atom_pos = backwards ? alt_atom->length() - k : k;
931 const int old_atom_pos = backwards ? old_atom->length() - k : k;
932#ifdef V8_INTL_SUPPORT
933 if (!CharAtEquals(ignore_case, alt_atom, alt_atom_pos, old_atom,
934 old_atom_pos)) {
935#else
936 if (!CharAtEquals(ignore_case, canonicalize, alt_atom, alt_atom_pos,
937 old_atom, old_atom_pos)) {
938#endif // V8_INTL_SUPPORT
939 affix_length = k;
940 break;
941 }
942 }
943 }
944 const int common_start =
945 backwards ? alt_atom->length() - affix_length : 0;
946 RegExpAtom* common = zone->New<RegExpAtom>(alt_atom->data().SubVector(
947 common_start, common_start + affix_length));
948 ZoneList<RegExpTree*>* distinct =
949 zone->New<ZoneList<RegExpTree*>>(run_length, zone);
950 for (int j = 0; j < run_length; j++) {
951 RegExpAtom* old_atom = alternatives->at(j + first_with_affix)->AsAtom();
952 int len = old_atom->length();
953 if (len == affix_length) {
954 distinct->Add(zone->New<RegExpEmpty>(), zone);
955 } else {
956 const int distinct_start = backwards ? 0 : affix_length;
957 const int distinct_end = backwards ? old_atom->length() - affix_length
958 : old_atom->length();
959 RegExpTree* part = zone->New<RegExpAtom>(
960 old_atom->data().SubVector(distinct_start, distinct_end));
961 distinct->Add(part, zone);
962 }
963 }
964 ZoneList<RegExpTree*>* pair = zone->New<ZoneList<RegExpTree*>>(2, zone);
965 if (backwards) {
966 pair->Add(zone->New<RegExpDisjunction>(distinct), zone);
967 pair->Add(common, zone);
968 } else {
969 pair->Add(common, zone);
970 pair->Add(zone->New<RegExpDisjunction>(distinct), zone);
971 }
972 alternatives->at(write_posn++) = zone->New<RegExpAlternative>(pair);
973 } else {
974 // Just copy any non-worthwhile alternatives.
975 for (int j = first_with_affix; j < i; j++) {
976 alternatives->at(write_posn++) = alternatives->at(j);
977 }
978 }
979 }
980 alternatives->Rewind(write_posn); // Trim end of array.
981}
982
983// Optimizes b|c|z to [bcz].
985 RegExpCompiler* compiler) {
986 Zone* zone = compiler->zone();
988 int length = alternatives->length();
989
990 int write_posn = 0;
991 int i = 0;
992 while (i < length) {
993 RegExpTree* alternative = alternatives->at(i);
994 if (!alternative->IsAtom()) {
995 alternatives->at(write_posn++) = alternatives->at(i);
996 i++;
997 continue;
998 }
999 RegExpAtom* const atom = alternative->AsAtom();
1000 if (atom->length() != 1) {
1001 alternatives->at(write_posn++) = alternatives->at(i);
1002 i++;
1003 continue;
1004 }
1005 const RegExpFlags flags = compiler->flags();
1008 bool contains_trail_surrogate =
1010 int first_in_run = i;
1011 i++;
1012 // Find a run of single-character atom alternatives that have identical
1013 // flags (case independence and unicode-ness).
1014 while (i < length) {
1015 alternative = alternatives->at(i);
1016 if (!alternative->IsAtom()) break;
1017 RegExpAtom* const alt_atom = alternative->AsAtom();
1018 if (alt_atom->length() != 1) break;
1020 !unibrow::Utf16::IsLeadSurrogate(alt_atom->data().at(0)));
1021 contains_trail_surrogate |=
1023 i++;
1024 }
1025 if (i > first_in_run + 1) {
1026 // Found non-trivial run of single-character alternatives.
1027 int run_length = i - first_in_run;
1028 ZoneList<CharacterRange>* ranges =
1029 zone->New<ZoneList<CharacterRange>>(2, zone);
1030 for (int j = 0; j < run_length; j++) {
1031 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
1032 DCHECK_EQ(old_atom->length(), 1);
1033 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
1034 }
1035 RegExpClassRanges::ClassRangesFlags class_ranges_flags;
1036 if (IsEitherUnicode(flags) && contains_trail_surrogate) {
1038 }
1039 alternatives->at(write_posn++) =
1040 zone->New<RegExpClassRanges>(zone, ranges, class_ranges_flags);
1041 } else {
1042 // Just copy any trivial alternatives.
1043 for (int j = first_in_run; j < i; j++) {
1044 alternatives->at(write_posn++) = alternatives->at(j);
1045 }
1046 }
1047 }
1048 alternatives->Rewind(write_posn); // Trim end of array.
1049}
1050
1052 RegExpNode* on_success) {
1053 compiler->ToNodeMaybeCheckForStackOverflow();
1054
1056
1057 if (alternatives->length() > 2) {
1058 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
1059 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
1061 if (alternatives->length() == 1) {
1062 return alternatives->at(0)->ToNode(compiler, on_success);
1063 }
1064 }
1065
1066 int length = alternatives->length();
1067
1068 ChoiceNode* result =
1069 compiler->zone()->New<ChoiceNode>(length, compiler->zone());
1070 for (int i = 0; i < length; i++) {
1071 GuardedAlternative alternative(
1072 alternatives->at(i)->ToNode(compiler, on_success));
1073 result->AddAlternative(alternative);
1074 }
1075 return result;
1076}
1077
1079 RegExpNode* on_success) {
1080 return ToNode(min(), max(), is_greedy(), body(), compiler, on_success);
1081}
1082
1083namespace {
1084// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
1085// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
1086RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
1087 RegExpNode* on_success,
1088 RegExpAssertion::Type type) {
1089 CHECK(NeedsUnicodeCaseEquivalents(compiler->flags()));
1090 Zone* zone = compiler->zone();
1091 ZoneList<CharacterRange>* word_range =
1092 zone->New<ZoneList<CharacterRange>>(2, zone);
1094 zone);
1095 int stack_register = compiler->UnicodeLookaroundStackRegister();
1096 int position_register = compiler->UnicodeLookaroundPositionRegister();
1097 ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
1098 // Add two choices. The (non-)boundary could start with a word or
1099 // a non-word-character.
1100 for (int i = 0; i < 2; i++) {
1101 bool lookbehind_for_word = i == 0;
1102 bool lookahead_for_word =
1103 (type == RegExpAssertion::Type::BOUNDARY) ^ lookbehind_for_word;
1104 // Look to the left.
1105 RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
1106 stack_register, position_register);
1108 zone, word_range, true, lookbehind.on_match_success());
1109 // Look to the right.
1110 RegExpLookaround::Builder lookahead(lookahead_for_word,
1111 lookbehind.ForMatch(backward),
1112 stack_register, position_register);
1114 zone, word_range, false, lookahead.on_match_success());
1115 result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
1116 }
1117 return result;
1118}
1119} // anonymous namespace
1120
1121RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
1122 RegExpNode* on_success) {
1123 NodeInfo info;
1124 Zone* zone = compiler->zone();
1125
1126 switch (assertion_type()) {
1127 case Type::START_OF_LINE:
1128 return AssertionNode::AfterNewline(on_success);
1129 case Type::START_OF_INPUT:
1130 return AssertionNode::AtStart(on_success);
1131 case Type::BOUNDARY:
1132 return NeedsUnicodeCaseEquivalents(compiler->flags())
1133 ? BoundaryAssertionAsLookaround(compiler, on_success,
1134 Type::BOUNDARY)
1135 : AssertionNode::AtBoundary(on_success);
1136 case Type::NON_BOUNDARY:
1137 return NeedsUnicodeCaseEquivalents(compiler->flags())
1138 ? BoundaryAssertionAsLookaround(compiler, on_success,
1139 Type::NON_BOUNDARY)
1140 : AssertionNode::AtNonBoundary(on_success);
1141 case Type::END_OF_INPUT:
1142 return AssertionNode::AtEnd(on_success);
1143 case Type::END_OF_LINE: {
1144 // Compile $ in multiline regexps as an alternation with a positive
1145 // lookahead in one side and an end-of-input on the other side.
1146 // We need two registers for the lookahead.
1147 int stack_pointer_register = compiler->AllocateRegister();
1148 int position_register = compiler->AllocateRegister();
1149 // The ChoiceNode to distinguish between a newline and end-of-input.
1150 ChoiceNode* result = zone->New<ChoiceNode>(2, zone);
1151 // Create a newline atom.
1152 ZoneList<CharacterRange>* newline_ranges =
1153 zone->New<ZoneList<CharacterRange>>(3, zone);
1155 newline_ranges, false, zone);
1156 RegExpClassRanges* newline_atom =
1157 zone->New<RegExpClassRanges>(StandardCharacterSet::kLineTerminator);
1158 ActionNode* submatch_success = ActionNode::PositiveSubmatchSuccess(
1159 stack_pointer_register, position_register,
1160 0, // No captures inside.
1161 -1, // Ignored if no captures.
1162 on_success);
1163 TextNode* newline_matcher =
1164 zone->New<TextNode>(newline_atom, false, submatch_success);
1165 // Create an end-of-input matcher.
1166 RegExpNode* end_of_line = ActionNode::BeginPositiveSubmatch(
1167 stack_pointer_register, position_register, newline_matcher,
1168 submatch_success);
1169 // Add the two alternatives to the ChoiceNode.
1170 GuardedAlternative eol_alternative(end_of_line);
1171 result->AddAlternative(eol_alternative);
1172 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
1173 result->AddAlternative(end_alternative);
1174 return result;
1175 }
1176 default:
1177 UNREACHABLE();
1178 }
1179}
1180
1181RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
1182 RegExpNode* on_success) {
1183 RegExpNode* backref_node = on_success;
1184 // Only one of the captures in the list can actually match. Since
1185 // back-references to unmatched captures are treated as empty, we can simply
1186 // create back-references to all possible captures.
1187 for (auto capture : *captures()) {
1188 backref_node = compiler->zone()->New<BackReferenceNode>(
1189 RegExpCapture::StartRegister(capture->index()),
1190 RegExpCapture::EndRegister(capture->index()), compiler->read_backward(),
1191 backref_node);
1192 }
1193 return backref_node;
1194}
1195
1196RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
1197 RegExpNode* on_success) {
1198 return on_success;
1199}
1200
1201namespace {
1202
1203class V8_NODISCARD ModifiersScope {
1204 public:
1205 ModifiersScope(RegExpCompiler* compiler, RegExpFlags flags)
1206 : compiler_(compiler), previous_flags_(compiler->flags()) {
1207 compiler->set_flags(flags);
1208 }
1209 ~ModifiersScope() { compiler_->set_flags(previous_flags_); }
1210
1211 private:
1212 RegExpCompiler* compiler_;
1213 const RegExpFlags previous_flags_;
1214};
1215
1216} // namespace
1217
1218RegExpNode* RegExpGroup::ToNode(RegExpCompiler* compiler,
1219 RegExpNode* on_success) {
1220 // If no flags are modified, simply convert and return the body.
1221 if (flags() == compiler->flags()) {
1222 return body_->ToNode(compiler, on_success);
1223 }
1224 // Reset flags for successor node.
1225 const RegExpFlags old_flags = compiler->flags();
1226 on_success = ActionNode::ModifyFlags(old_flags, on_success);
1227
1228 // Convert body using modifier.
1229 ModifiersScope modifiers_scope(compiler, flags());
1230 RegExpNode* body = body_->ToNode(compiler, on_success);
1231
1232 // Wrap body into modifier node.
1233 RegExpNode* modified_body = ActionNode::ModifyFlags(flags(), body);
1234 return modified_body;
1235}
1236
1238 int stack_pointer_register,
1239 int position_register,
1240 int capture_register_count,
1241 int capture_register_start)
1242 : is_positive_(is_positive),
1243 on_success_(on_success),
1244 stack_pointer_register_(stack_pointer_register),
1245 position_register_(position_register) {
1246 if (is_positive_) {
1248 stack_pointer_register, position_register, capture_register_count,
1249 capture_register_start, on_success_);
1250 } else {
1251 Zone* zone = on_success_->zone();
1253 stack_pointer_register, position_register, capture_register_count,
1254 capture_register_start, zone);
1255 }
1256}
1257
1259 if (is_positive_) {
1260 ActionNode* on_match_success = on_match_success_->AsActionNode();
1262 stack_pointer_register_, position_register_, match, on_match_success);
1263 } else {
1264 Zone* zone = on_success_->zone();
1265 // We use a ChoiceNode to represent the negative lookaround. The first
1266 // alternative is the negative match. On success, the end node backtracks.
1267 // On failure, the second alternative is tried and leads to success.
1268 // NegativeLookaroundChoiceNode is a special ChoiceNode that ignores the
1269 // first exit when calculating quick checks.
1270 ChoiceNode* choice_node = zone->New<NegativeLookaroundChoiceNode>(
1271 GuardedAlternative(match), GuardedAlternative(on_success_), zone);
1272 return ActionNode::BeginNegativeSubmatch(stack_pointer_register_,
1273 position_register_, choice_node);
1274 }
1275}
1276
1278 RegExpNode* on_success) {
1279 compiler->ToNodeMaybeCheckForStackOverflow();
1280
1281 int stack_pointer_register = compiler->AllocateRegister();
1282 int position_register = compiler->AllocateRegister();
1283
1284 const int registers_per_capture = 2;
1285 const int register_of_first_capture = 2;
1286 int register_count = capture_count_ * registers_per_capture;
1287 int register_start =
1288 register_of_first_capture + capture_from_ * registers_per_capture;
1289
1291 bool was_reading_backward = compiler->read_backward();
1292 compiler->set_read_backward(type() == LOOKBEHIND);
1293 Builder builder(is_positive(), on_success, stack_pointer_register,
1294 position_register, register_count, register_start);
1295 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
1296 result = builder.ForMatch(match);
1297 compiler->set_read_backward(was_reading_backward);
1298 return result;
1299}
1300
1302 RegExpNode* on_success) {
1303 return ToNode(body(), index(), compiler, on_success);
1304}
1305
1307 RegExpCompiler* compiler,
1308 RegExpNode* on_success) {
1310 int start_reg = RegExpCapture::StartRegister(index);
1311 int end_reg = RegExpCapture::EndRegister(index);
1312 if (compiler->read_backward()) std::swap(start_reg, end_reg);
1313 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
1314 RegExpNode* body_node = body->ToNode(compiler, store_end);
1315 return ActionNode::StorePosition(start_reg, true, body_node);
1316}
1317
1318namespace {
1319
1320class AssertionSequenceRewriter final {
1321 public:
1322 // TODO(jgruber): Consider moving this to a separate AST tree rewriter pass
1323 // instead of sprinkling rewrites into the AST->Node conversion process.
1324 static void MaybeRewrite(ZoneList<RegExpTree*>* terms, Zone* zone) {
1325 AssertionSequenceRewriter rewriter(terms, zone);
1326
1327 static constexpr int kNoIndex = -1;
1328 int from = kNoIndex;
1329
1330 for (int i = 0; i < terms->length(); i++) {
1331 RegExpTree* t = terms->at(i);
1332 if (from == kNoIndex && t->IsAssertion()) {
1333 from = i; // Start a sequence.
1334 } else if (from != kNoIndex && !t->IsAssertion()) {
1335 // Terminate and process the sequence.
1336 if (i - from > 1) rewriter.Rewrite(from, i);
1337 from = kNoIndex;
1338 }
1339 }
1340
1341 if (from != kNoIndex && terms->length() - from > 1) {
1342 rewriter.Rewrite(from, terms->length());
1343 }
1344 }
1345
1346 // All assertions are zero width. A consecutive sequence of assertions is
1347 // order-independent. There's two ways we can optimize here:
1348 // 1. fold all identical assertions.
1349 // 2. if any assertion combinations are known to fail (e.g. \b\B), the entire
1350 // sequence fails.
1351 void Rewrite(int from, int to) {
1352 DCHECK_GT(to, from + 1);
1353
1354 // Bitfield of all seen assertions.
1355 uint32_t seen_assertions = 0;
1356 static_assert(static_cast<int>(RegExpAssertion::Type::LAST_ASSERTION_TYPE) <
1358
1359 for (int i = from; i < to; i++) {
1360 RegExpAssertion* t = terms_->at(i)->AsAssertion();
1361 const uint32_t bit = 1 << static_cast<int>(t->assertion_type());
1362
1363 if (seen_assertions & bit) {
1364 // Fold duplicates.
1365 terms_->Set(i, zone_->New<RegExpEmpty>());
1366 }
1367
1368 seen_assertions |= bit;
1369 }
1370
1371 // Collapse failures.
1372 const uint32_t always_fails_mask =
1373 1 << static_cast<int>(RegExpAssertion::Type::BOUNDARY) |
1374 1 << static_cast<int>(RegExpAssertion::Type::NON_BOUNDARY);
1375 if ((seen_assertions & always_fails_mask) == always_fails_mask) {
1376 ReplaceSequenceWithFailure(from, to);
1377 }
1378 }
1379
1380 void ReplaceSequenceWithFailure(int from, int to) {
1381 // Replace the entire sequence with a single node that always fails.
1382 // TODO(jgruber): Consider adding an explicit Fail kind. Until then, the
1383 // negated '*' (everything) range serves the purpose.
1384 ZoneList<CharacterRange>* ranges =
1385 zone_->New<ZoneList<CharacterRange>>(0, zone_);
1386 RegExpClassRanges* cc = zone_->New<RegExpClassRanges>(zone_, ranges);
1387 terms_->Set(from, cc);
1388
1389 // Zero out the rest.
1390 RegExpEmpty* empty = zone_->New<RegExpEmpty>();
1391 for (int i = from + 1; i < to; i++) terms_->Set(i, empty);
1392 }
1393
1394 private:
1395 AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms, Zone* zone)
1396 : zone_(zone), terms_(terms) {}
1397
1399 ZoneList<RegExpTree*>* terms_;
1400};
1401
1402} // namespace
1403
1404RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
1405 RegExpNode* on_success) {
1406 compiler->ToNodeMaybeCheckForStackOverflow();
1407
1408 ZoneList<RegExpTree*>* children = nodes();
1409
1410 AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone());
1411
1412 RegExpNode* current = on_success;
1413 if (compiler->read_backward()) {
1414 for (int i = 0; i < children->length(); i++) {
1415 current = children->at(i)->ToNode(compiler, current);
1416 }
1417 } else {
1418 for (int i = children->length() - 1; i >= 0; i--) {
1419 current = children->at(i)->ToNode(compiler, current);
1420 }
1421 }
1422 return current;
1423}
1424
1425namespace {
1426
1427void AddClass(const int* elmv, int elmc, ZoneList<CharacterRange>* ranges,
1428 Zone* zone) {
1429 elmc--;
1430 DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
1431 for (int i = 0; i < elmc; i += 2) {
1432 DCHECK(elmv[i] < elmv[i + 1]);
1433 ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
1434 }
1435}
1436
1437void AddClassNegated(const int* elmv, int elmc,
1438 ZoneList<CharacterRange>* ranges, Zone* zone) {
1439 elmc--;
1440 DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
1441 DCHECK_NE(0x0000, elmv[0]);
1442 DCHECK_NE(kMaxCodePoint, elmv[elmc - 1]);
1443 base::uc16 last = 0x0000;
1444 for (int i = 0; i < elmc; i += 2) {
1445 DCHECK(last <= elmv[i] - 1);
1446 DCHECK(elmv[i] < elmv[i + 1]);
1447 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
1448 last = elmv[i + 1];
1449 }
1450 ranges->Add(CharacterRange::Range(last, kMaxCodePoint), zone);
1451}
1452
1453} // namespace
1454
1457 bool add_unicode_case_equivalents,
1458 Zone* zone) {
1459 if (add_unicode_case_equivalents &&
1460 (standard_character_set == StandardCharacterSet::kWord ||
1461 standard_character_set == StandardCharacterSet::kNotWord)) {
1462 // See #sec-runtime-semantics-wordcharacters-abstract-operation
1463 // In case of unicode and ignore_case, we need to create the closure over
1464 // case equivalent characters before negating.
1465 ZoneList<CharacterRange>* new_ranges =
1466 zone->New<ZoneList<CharacterRange>>(2, zone);
1467 AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
1468 AddUnicodeCaseEquivalents(new_ranges, zone);
1469 if (standard_character_set == StandardCharacterSet::kNotWord) {
1470 ZoneList<CharacterRange>* negated =
1471 zone->New<ZoneList<CharacterRange>>(2, zone);
1472 CharacterRange::Negate(new_ranges, negated, zone);
1473 new_ranges = negated;
1474 }
1475 ranges->AddAll(*new_ranges, zone);
1476 return;
1477 }
1478
1479 switch (standard_character_set) {
1481 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
1482 break;
1484 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
1485 break;
1487 AddClass(kWordRanges, kWordRangeCount, ranges, zone);
1488 break;
1490 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
1491 break;
1493 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
1494 break;
1496 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
1497 break;
1498 // This is the set of characters matched by the $ and ^ symbols
1499 // in multiline mode.
1501 AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone);
1502 break;
1504 AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges,
1505 zone);
1506 break;
1507 // This is not a character range as defined by the spec but a
1508 // convenient shorthand for a character class that matches any
1509 // character.
1511 ranges->Add(CharacterRange::Everything(), zone);
1512 break;
1513 }
1514}
1515
1516// static
1517// Only for /i, not for /ui or /vi.
1520 bool is_one_byte) {
1522 int range_count = ranges->length();
1523#ifdef V8_INTL_SUPPORT
1524 icu::UnicodeSet others;
1525 for (int i = 0; i < range_count; i++) {
1526 CharacterRange range = ranges->at(i);
1527 base::uc32 from = range.from();
1528 if (from > kMaxUtf16CodeUnit) continue;
1529 base::uc32 to = std::min({range.to(), kMaxUtf16CodeUnitU});
1530 // Nothing to be done for surrogates.
1531 if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue;
1532 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
1533 if (from > String::kMaxOneByteCharCode) continue;
1535 }
1536 others.add(from, to);
1537 }
1538
1539 // Compute the set of additional characters that should be added,
1540 // using UnicodeSet::closeOver. ECMA 262 defines slightly different
1541 // case-folding rules than Unicode, so some characters that are
1542 // added by closeOver do not match anything other than themselves in
1543 // JS. For example, 'Ĺż' (U+017F LATIN SMALL LETTER LONG S) is the
1544 // same case-insensitive character as 's' or 'S' according to
1545 // Unicode, but does not match any other character in JS. To handle
1546 // this case, we add such characters to the IgnoreSet and filter
1547 // them out. We filter twice: once before calling closeOver (to
1548 // prevent 'Ĺż' from adding 's'), and once after calling closeOver
1549 // (to prevent 's' from adding 'Ĺż'). See regexp/special-case.h for
1550 // more information.
1551 icu::UnicodeSet already_added(others);
1552 others.removeAll(RegExpCaseFolding::IgnoreSet());
1553 others.closeOver(USET_CASE_INSENSITIVE);
1554 others.removeAll(RegExpCaseFolding::IgnoreSet());
1555 others.removeAll(already_added);
1556
1557 // Add others to the ranges
1558 for (int32_t i = 0; i < others.getRangeCount(); i++) {
1559 UChar32 from = others.getRangeStart(i);
1560 UChar32 to = others.getRangeEnd(i);
1561 if (from == to) {
1562 ranges->Add(CharacterRange::Singleton(from), zone);
1563 } else {
1564 ranges->Add(CharacterRange::Range(from, to), zone);
1565 }
1566 }
1567#else
1568 for (int i = 0; i < range_count; i++) {
1569 CharacterRange range = ranges->at(i);
1570 base::uc32 bottom = range.from();
1571 if (bottom > kMaxUtf16CodeUnit) continue;
1572 base::uc32 top = std::min({range.to(), kMaxUtf16CodeUnitU});
1573 // Nothing to be done for surrogates.
1574 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
1575 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
1576 if (bottom > String::kMaxOneByteCharCode) continue;
1578 }
1580 if (top == bottom) {
1581 // If this is a singleton we just expand the one character.
1582 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
1583 for (int j = 0; j < length; j++) {
1584 base::uc32 chr = chars[j];
1585 if (chr != bottom) {
1586 ranges->Add(CharacterRange::Singleton(chars[j]), zone);
1587 }
1588 }
1589 } else {
1590 // If this is a range we expand the characters block by block, expanding
1591 // contiguous subranges (blocks) one at a time. The approach is as
1592 // follows. For a given start character we look up the remainder of the
1593 // block that contains it (represented by the end point), for instance we
1594 // find 'z' if the character is 'c'. A block is characterized by the
1595 // property that all characters uncanonicalize in the same way, except
1596 // that each entry in the result is incremented by the distance from the
1597 // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
1598 // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
1599 // we've found the end point we look up its uncanonicalization and
1600 // produce a range for each element. For instance for [c-f] we look up
1601 // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
1602 // it is not already contained in the input, so [c-f] will be skipped but
1603 // [C-F] will be added. If this range is not completely contained in a
1604 // block we do this for all the blocks covered by the range (handling
1605 // characters that is not in a block as a "singleton block").
1607 base::uc32 pos = bottom;
1608 while (pos <= top) {
1609 int length =
1610 isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
1611 base::uc32 block_end;
1612 if (length == 0) {
1613 block_end = pos;
1614 } else {
1615 DCHECK_EQ(1, length);
1616 block_end = equivalents[0];
1617 }
1618 int end = (block_end > top) ? top : block_end;
1619 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
1620 equivalents);
1621 for (int j = 0; j < length; j++) {
1622 base::uc32 c = equivalents[j];
1623 base::uc32 range_from = c - (block_end - pos);
1624 base::uc32 range_to = c - (block_end - end);
1625 if (!(bottom <= range_from && range_to <= top)) {
1626 ranges->Add(CharacterRange::Range(range_from, range_to), zone);
1627 }
1628 }
1629 pos = end + 1;
1630 }
1631 }
1632 }
1633#endif // V8_INTL_SUPPORT
1634}
1635
1637 DCHECK_NOT_NULL(ranges);
1638 int n = ranges->length();
1639 if (n <= 1) return true;
1640 base::uc32 max = ranges->at(0).to();
1641 for (int i = 1; i < n; i++) {
1642 CharacterRange next_range = ranges->at(i);
1643 if (next_range.from() <= max + 1) return false;
1644 max = next_range.to();
1645 }
1646 return true;
1647}
1648
1650 if (ranges_ == nullptr) {
1651 ranges_ = zone->New<ZoneList<CharacterRange>>(2, zone);
1652 CharacterRange::AddClassEscape(standard_set_type_.value(), ranges_, false,
1653 zone);
1654 }
1655 return ranges_;
1656}
1657
1658namespace {
1659
1660// Move a number of elements in a zonelist to another position
1661// in the same list. Handles overlapping source and target areas.
1662void MoveRanges(ZoneList<CharacterRange>* list, int from, int to, int count) {
1663 // Ranges are potentially overlapping.
1664 if (from < to) {
1665 for (int i = count - 1; i >= 0; i--) {
1666 list->at(to + i) = list->at(from + i);
1667 }
1668 } else {
1669 for (int i = 0; i < count; i++) {
1670 list->at(to + i) = list->at(from + i);
1671 }
1672 }
1673}
1674
1675int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list, int count,
1676 CharacterRange insert) {
1677 // Inserts a range into list[0..count[, which must be sorted
1678 // by from value and non-overlapping and non-adjacent, using at most
1679 // list[0..count] for the result. Returns the number of resulting
1680 // canonicalized ranges. Inserting a range may collapse existing ranges into
1681 // fewer ranges, so the return value can be anything in the range 1..count+1.
1682 base::uc32 from = insert.from();
1683 base::uc32 to = insert.to();
1684 int start_pos = 0;
1685 int end_pos = count;
1686 for (int i = count - 1; i >= 0; i--) {
1687 CharacterRange current = list->at(i);
1688 if (current.from() > to + 1) {
1689 end_pos = i;
1690 } else if (current.to() + 1 < from) {
1691 start_pos = i + 1;
1692 break;
1693 }
1694 }
1695
1696 // Inserted range overlaps, or is adjacent to, ranges at positions
1697 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
1698 // not affected by the insertion.
1699 // If start_pos == end_pos, the range must be inserted before start_pos.
1700 // if start_pos < end_pos, the entire range from start_pos to end_pos
1701 // must be merged with the insert range.
1702
1703 if (start_pos == end_pos) {
1704 // Insert between existing ranges at position start_pos.
1705 if (start_pos < count) {
1706 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
1707 }
1708 list->at(start_pos) = insert;
1709 return count + 1;
1710 }
1711 if (start_pos + 1 == end_pos) {
1712 // Replace single existing range at position start_pos.
1713 CharacterRange to_replace = list->at(start_pos);
1714 int new_from = std::min(to_replace.from(), from);
1715 int new_to = std::max(to_replace.to(), to);
1716 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
1717 return count;
1718 }
1719 // Replace a number of existing ranges from start_pos to end_pos - 1.
1720 // Move the remaining ranges down.
1721
1722 int new_from = std::min(list->at(start_pos).from(), from);
1723 int new_to = std::max(list->at(end_pos - 1).to(), to);
1724 if (end_pos < count) {
1725 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
1726 }
1727 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
1728 return count - (end_pos - start_pos) + 1;
1729}
1730
1731} // namespace
1732
1734 // Special/default classes are always considered canonical. The result
1735 // of calling ranges() will be sorted.
1736 if (ranges_ == nullptr) return;
1738}
1739
1740// static
1742 if (character_ranges->length() <= 1) return;
1743 // Check whether ranges are already canonical (increasing, non-overlapping,
1744 // non-adjacent).
1745 int n = character_ranges->length();
1746 base::uc32 max = character_ranges->at(0).to();
1747 int i = 1;
1748 while (i < n) {
1749 CharacterRange current = character_ranges->at(i);
1750 if (current.from() <= max + 1) {
1751 break;
1752 }
1753 max = current.to();
1754 i++;
1755 }
1756 // Canonical until the i'th range. If that's all of them, we are done.
1757 if (i == n) return;
1758
1759 // The ranges at index i and forward are not canonicalized. Make them so by
1760 // doing the equivalent of insertion sort (inserting each into the previous
1761 // list, in order).
1762 // Notice that inserting a range can reduce the number of ranges in the
1763 // result due to combining of adjacent and overlapping ranges.
1764 int read = i; // Range to insert.
1765 int num_canonical = i; // Length of canonicalized part of list.
1766 do {
1767 num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical,
1768 character_ranges->at(read));
1769 read++;
1770 } while (read < n);
1771 character_ranges->Rewind(num_canonical);
1772
1773 DCHECK(CharacterRange::IsCanonical(character_ranges));
1774}
1775
1776// static
1778 ZoneList<CharacterRange>* negated_ranges,
1779 Zone* zone) {
1781 DCHECK_EQ(0, negated_ranges->length());
1782 int range_count = ranges->length();
1783 base::uc32 from = 0;
1784 int i = 0;
1785 if (range_count > 0 && ranges->at(0).from() == 0) {
1786 from = ranges->at(0).to() + 1;
1787 i = 1;
1788 }
1789 while (i < range_count) {
1790 CharacterRange range = ranges->at(i);
1791 negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
1792 from = range.to() + 1;
1793 i++;
1794 }
1795 if (from < kMaxCodePoint) {
1796 negated_ranges->Add(CharacterRange::Range(from, kMaxCodePoint), zone);
1797 }
1798}
1799
1800// static
1802 const ZoneList<CharacterRange>* rhs,
1803 ZoneList<CharacterRange>* intersection,
1804 Zone* zone) {
1807 DCHECK_EQ(0, intersection->length());
1808 int lhs_index = 0;
1809 int rhs_index = 0;
1810 while (lhs_index < lhs->length() && rhs_index < rhs->length()) {
1811 // Skip non-overlapping ranges.
1812 if (lhs->at(lhs_index).to() < rhs->at(rhs_index).from()) {
1813 lhs_index++;
1814 continue;
1815 }
1816 if (rhs->at(rhs_index).to() < lhs->at(lhs_index).from()) {
1817 rhs_index++;
1818 continue;
1819 }
1820
1821 base::uc32 from =
1822 std::max(lhs->at(lhs_index).from(), rhs->at(rhs_index).from());
1823 base::uc32 to = std::min(lhs->at(lhs_index).to(), rhs->at(rhs_index).to());
1824 intersection->Add(CharacterRange::Range(from, to), zone);
1825 if (to == lhs->at(lhs_index).to()) {
1826 lhs_index++;
1827 } else {
1828 rhs_index++;
1829 }
1830 }
1831
1832 DCHECK(IsCanonical(intersection));
1833}
1834
1835namespace {
1836
1837// Advance |index| and set |from| and |to| to the new range, if not out of
1838// bounds of |range|, otherwise |from| is set to a code point beyond the legal
1839// unicode character range.
1840void SafeAdvanceRange(const ZoneList<CharacterRange>* range, int* index,
1841 base::uc32* from, base::uc32* to) {
1842 ++(*index);
1843 if (*index < range->length()) {
1844 *from = range->at(*index).from();
1845 *to = range->at(*index).to();
1846 } else {
1847 *from = kMaxCodePoint + 1;
1848 }
1849}
1850
1851} // namespace
1852
1853// static
1855 const ZoneList<CharacterRange>* to_remove,
1859 DCHECK_EQ(0, result->length());
1860
1861 if (src->is_empty()) return;
1862
1863 int src_index = 0;
1864 int to_remove_index = 0;
1865 base::uc32 from = src->at(src_index).from();
1866 base::uc32 to = src->at(src_index).to();
1867 while (src_index < src->length() && to_remove_index < to_remove->length()) {
1868 CharacterRange remove_range = to_remove->at(to_remove_index);
1869 if (remove_range.to() < from) {
1870 // (a) Non-overlapping case, ignore current to_remove range.
1871 // |-------|
1872 // |-------|
1873 to_remove_index++;
1874 } else if (to < remove_range.from()) {
1875 // (b) Non-overlapping case, add full current range to result.
1876 // |-------|
1877 // |-------|
1878 result->Add(CharacterRange::Range(from, to), zone);
1879 SafeAdvanceRange(src, &src_index, &from, &to);
1880 } else if (from >= remove_range.from() && to <= remove_range.to()) {
1881 // (c) Current to_remove range fully covers current range.
1882 // |---|
1883 // |-------|
1884 SafeAdvanceRange(src, &src_index, &from, &to);
1885 } else if (from < remove_range.from() && to > remove_range.to()) {
1886 // (d) Split current range.
1887 // |-------|
1888 // |---|
1889 result->Add(CharacterRange::Range(from, remove_range.from() - 1), zone);
1890 from = remove_range.to() + 1;
1891 to_remove_index++;
1892 } else if (from < remove_range.from()) {
1893 // (e) End current range.
1894 // |-------|
1895 // |-------|
1896 to = remove_range.from() - 1;
1897 result->Add(CharacterRange::Range(from, to), zone);
1898 SafeAdvanceRange(src, &src_index, &from, &to);
1899 } else if (to > remove_range.to()) {
1900 // (f) Modify start of current range.
1901 // |-------|
1902 // |-------|
1903 from = remove_range.to() + 1;
1904 to_remove_index++;
1905 } else {
1906 UNREACHABLE();
1907 }
1908 }
1909 // The last range needs special treatment after |to_remove| is exhausted, as
1910 // |from| might have been modified by the last |to_remove| range and |to| was
1911 // not yet known (i.e. cases d and f).
1912 if (from <= to) {
1913 result->Add(CharacterRange::Range(from, to), zone);
1914 }
1915 src_index++;
1916
1917 // Add remaining ranges after |to_remove| is exhausted.
1918 for (; src_index < src->length(); src_index++) {
1919 result->Add(src->at(src_index), zone);
1920 }
1921
1922 DCHECK(IsCanonical(result));
1923}
1924
1925// static
1927 DCHECK(IsCanonical(ranges));
1928
1929 // Drop all ranges that don't contain one-byte code units, and clamp the last
1930 // range s.t. it likewise only contains one-byte code units. Note this relies
1931 // on `ranges` being canonicalized, i.e. sorted and non-overlapping.
1932
1933 static constexpr base::uc32 max_char = String::kMaxOneByteCharCodeU;
1934 int n = ranges->length();
1935 for (; n > 0; n--) {
1936 CharacterRange& r = ranges->at(n - 1);
1937 if (r.from() <= max_char) {
1938 r.to_ = std::min(r.to_, max_char);
1939 break;
1940 }
1941 }
1942
1943 ranges->Rewind(n);
1944}
1945
1946// static
1948 const ZoneList<CharacterRange>* rhs) {
1949 DCHECK(IsCanonical(lhs));
1950 DCHECK(IsCanonical(rhs));
1951 if (lhs->length() != rhs->length()) return false;
1952
1953 for (int i = 0; i < lhs->length(); i++) {
1954 if (lhs->at(i) != rhs->at(i)) return false;
1955 }
1956
1957 return true;
1958}
1959
1960namespace {
1961
1962// Scoped object to keep track of how much we unroll quantifier loops in the
1963// regexp graph generator.
1964class RegExpExpansionLimiter {
1965 public:
1966 static const int kMaxExpansionFactor = 6;
1967 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
1968 : compiler_(compiler),
1969 saved_expansion_factor_(compiler->current_expansion_factor()),
1971 DCHECK_LT(0, factor);
1972 if (ok_to_expand_) {
1973 if (factor > kMaxExpansionFactor) {
1974 // Avoid integer overflow of the current expansion factor.
1975 ok_to_expand_ = false;
1976 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
1977 } else {
1978 int new_factor = saved_expansion_factor_ * factor;
1979 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
1980 compiler->set_current_expansion_factor(new_factor);
1981 }
1982 }
1983 }
1984
1985 ~RegExpExpansionLimiter() {
1986 compiler_->set_current_expansion_factor(saved_expansion_factor_);
1987 }
1988
1989 bool ok_to_expand() { return ok_to_expand_; }
1990
1991 private:
1992 RegExpCompiler* compiler_;
1995
1996 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
1997};
1998
1999} // namespace
2000
2001RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy,
2002 RegExpTree* body, RegExpCompiler* compiler,
2003 RegExpNode* on_success,
2004 bool not_at_start) {
2005 // x{f, t} becomes this:
2006 //
2007 // (r++)<-.
2008 // | `
2009 // | (x)
2010 // v ^
2011 // (r=0)-->(?)---/ [if r < t]
2012 // |
2013 // [if r >= f] \----> ...
2014 //
2015
2016 // 15.10.2.5 RepeatMatcher algorithm.
2017 // The parser has already eliminated the case where max is 0. In the case
2018 // where max_match is zero the parser has removed the quantifier if min was
2019 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
2020
2021 // If we know that we cannot match zero length then things are a little
2022 // simpler since we don't need to make the special zero length match check
2023 // from step 2.1. If the min and max are small we can unroll a little in
2024 // this case.
2025 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
2026 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
2027 if (max == 0) return on_success; // This can happen due to recursion.
2028 bool body_can_be_empty = (body->min_match() == 0);
2029 int body_start_reg = RegExpCompiler::kNoRegister;
2030 Interval capture_registers = body->CaptureRegisters();
2031 bool needs_capture_clearing = !capture_registers.is_empty();
2032 Zone* zone = compiler->zone();
2033
2034 if (body_can_be_empty) {
2035 body_start_reg = compiler->AllocateRegister();
2036 } else if (compiler->optimize() && !needs_capture_clearing) {
2037 // Only unroll if there are no captures and the body can't be
2038 // empty.
2039 {
2040 RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
2041 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
2042 int new_max = (max == kInfinity) ? max : max - min;
2043 // Recurse once to get the loop or optional matches after the fixed
2044 // ones.
2045 RegExpNode* answer =
2046 ToNode(0, new_max, is_greedy, body, compiler, on_success, true);
2047 // Unroll the forced matches from 0 to min. This can cause chains of
2048 // TextNodes (which the parser does not generate). These should be
2049 // combined if it turns out they hinder good code generation.
2050 for (int i = 0; i < min; i++) {
2051 answer = body->ToNode(compiler, answer);
2052 }
2053 return answer;
2054 }
2055 }
2056 if (max <= kMaxUnrolledMaxMatches && min == 0) {
2057 DCHECK_LT(0, max); // Due to the 'if' above.
2058 RegExpExpansionLimiter limiter(compiler, max);
2059 if (limiter.ok_to_expand()) {
2060 // Unroll the optional matches up to max.
2061 RegExpNode* answer = on_success;
2062 for (int i = 0; i < max; i++) {
2063 ChoiceNode* alternation = zone->New<ChoiceNode>(2, zone);
2064 if (is_greedy) {
2065 alternation->AddAlternative(
2066 GuardedAlternative(body->ToNode(compiler, answer)));
2067 alternation->AddAlternative(GuardedAlternative(on_success));
2068 } else {
2069 alternation->AddAlternative(GuardedAlternative(on_success));
2070 alternation->AddAlternative(
2071 GuardedAlternative(body->ToNode(compiler, answer)));
2072 }
2073 answer = alternation;
2074 if (not_at_start && !compiler->read_backward()) {
2075 alternation->set_not_at_start();
2076 }
2077 }
2078 return answer;
2079 }
2080 }
2081 }
2082 bool has_min = min > 0;
2083 bool has_max = max < RegExpTree::kInfinity;
2084 bool needs_counter = has_min || has_max;
2085 int reg_ctr = needs_counter ? compiler->AllocateRegister()
2087 LoopChoiceNode* center = zone->New<LoopChoiceNode>(
2088 body->min_match() == 0, compiler->read_backward(), min, zone);
2089 if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
2090 RegExpNode* loop_return =
2091 needs_counter ? static_cast<RegExpNode*>(
2092 ActionNode::IncrementRegister(reg_ctr, center))
2093 : static_cast<RegExpNode*>(center);
2094 if (body_can_be_empty) {
2095 // If the body can be empty we need to check if it was and then
2096 // backtrack.
2097 loop_return =
2098 ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return);
2099 }
2100 RegExpNode* body_node = body->ToNode(compiler, loop_return);
2101 if (body_can_be_empty) {
2102 // If the body can be empty we need to store the start position
2103 // so we can bail out if it was empty.
2104 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
2105 }
2106 if (needs_capture_clearing) {
2107 // Before entering the body of this loop we need to clear captures.
2108 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
2109 }
2110 GuardedAlternative body_alt(body_node);
2111 if (has_max) {
2112 Guard* body_guard = zone->New<Guard>(reg_ctr, Guard::LT, max);
2113 body_alt.AddGuard(body_guard, zone);
2114 }
2115 GuardedAlternative rest_alt(on_success);
2116 if (has_min) {
2117 Guard* rest_guard = compiler->zone()->New<Guard>(reg_ctr, Guard::GEQ, min);
2118 rest_alt.AddGuard(rest_guard, zone);
2119 }
2120 if (is_greedy) {
2121 center->AddLoopAlternative(body_alt);
2122 center->AddContinueAlternative(rest_alt);
2123 } else {
2124 center->AddContinueAlternative(rest_alt);
2125 center->AddLoopAlternative(body_alt);
2126 }
2127 if (needs_counter) {
2128 return ActionNode::SetRegisterForLoop(reg_ctr, 0, center);
2129 } else {
2130 return center;
2131 }
2132}
2133
2134} // namespace internal
2135} // namespace v8
friend Zone
Definition asm-types.cc:195
ThreadLocalTop * top
SourcePosition pos
int get(uchar c, uchar n, uchar *result)
Definition unicode-inl.h:32
static uint16_t LeadSurrogate(uint32_t char_code)
Definition unicode.h:126
static uint16_t TrailSurrogate(uint32_t char_code)
Definition unicode.h:129
static bool IsTrailSurrogate(int code)
Definition unicode.h:109
static bool IsLeadSurrogate(int code)
Definition unicode.h:106
size_t size() const
void emplace_back(Args &&... args)
T & at(size_t index)
const T & at(size_t index) const
Definition vector.h:81
ActionNode * AsActionNode() override
static ActionNode * EmptyMatchCheck(int start_register, int repetition_register, int repetition_limit, RegExpNode *on_success)
static ActionNode * ModifyFlags(RegExpFlags flags, RegExpNode *on_success)
static ActionNode * StorePosition(int reg, bool is_capture, RegExpNode *on_success)
static ActionNode * PositiveSubmatchSuccess(int stack_pointer_reg, int restore_reg, int clear_capture_count, int clear_capture_from, RegExpNode *on_success)
static ActionNode * BeginNegativeSubmatch(int stack_pointer_reg, int position_reg, RegExpNode *on_success)
static ActionNode * SetRegisterForLoop(int reg, int val, RegExpNode *on_success)
static ActionNode * IncrementRegister(int reg, RegExpNode *on_success)
static ActionNode * BeginPositiveSubmatch(int stack_pointer_reg, int position_reg, RegExpNode *body, ActionNode *success_node)
static ActionNode * ClearCaptures(Interval range, RegExpNode *on_success)
static AssertionNode * AtStart(RegExpNode *on_success)
static AssertionNode * AtEnd(RegExpNode *on_success)
static AssertionNode * AfterNewline(RegExpNode *on_success)
static bool Equals(const ZoneList< CharacterRange > *lhs, const ZoneList< CharacterRange > *rhs)
static void Canonicalize(ZoneList< CharacterRange > *ranges)
static void AddUnicodeCaseEquivalents(ZoneList< CharacterRange > *ranges, Zone *zone)
base::uc32 from() const
Definition regexp-ast.h:140
static void Subtract(const ZoneList< CharacterRange > *src, const ZoneList< CharacterRange > *to_remove, ZoneList< CharacterRange > *dst, Zone *zone)
static void Negate(const ZoneList< CharacterRange > *src, ZoneList< CharacterRange > *dst, Zone *zone)
base::uc32 to() const
Definition regexp-ast.h:141
static V8_EXPORT_PRIVATE void AddCaseEquivalents(Isolate *isolate, Zone *zone, ZoneList< CharacterRange > *ranges, bool is_one_byte)
static void ClampToOneByte(ZoneList< CharacterRange > *ranges)
static CharacterRange Singleton(base::uc32 value)
Definition regexp-ast.h:102
static V8_EXPORT_PRIVATE bool IsCanonical(const ZoneList< CharacterRange > *ranges)
static ZoneList< CharacterRange > * List(Zone *zone, CharacterRange range)
Definition regexp-ast.h:114
static void Intersect(const ZoneList< CharacterRange > *lhs, const ZoneList< CharacterRange > *rhs, ZoneList< CharacterRange > *dst, Zone *zone)
static CharacterRange Range(base::uc32 from, base::uc32 to)
Definition regexp-ast.h:105
static CharacterRange Everything()
Definition regexp-ast.h:110
static V8_EXPORT_PRIVATE void AddClassEscape(StandardCharacterSet standard_character_set, ZoneList< CharacterRange > *ranges, bool add_unicode_case_equivalents, Zone *zone)
void set_standard_set_type(StandardCharacterSet standard_set_type)
Definition regexp-ast.h:294
V8_EXPORT_PRIVATE void Canonicalize()
ZoneList< CharacterRange > * ranges(Zone *zone)
base::Vector< const base::uc16 > data() const
Definition regexp-ast.h:483
static int StartRegister(int index)
Definition regexp-ast.h:621
static RegExpNode * ToNode(RegExpTree *body, int index, RegExpCompiler *compiler, RegExpNode *on_success)
static int EndRegister(int index)
Definition regexp-ast.h:622
StandardCharacterSet standard_type() const
Definition regexp-ast.h:348
ZoneList< CharacterRange > * ranges(Zone *zone)
Definition regexp-ast.h:353
RegExpClassRanges(Zone *zone, ZoneList< CharacterRange > *ranges, ClassRangesFlags class_ranges_flags=ClassRangesFlags())
Definition regexp-ast.h:320
static RegExpClassSetOperand * ComputeExpression(RegExpTree *root, ZoneList< CharacterRange > *temp_ranges, Zone *zone)
CharacterClassStrings * strings_
Definition regexp-ast.h:424
ZoneList< CharacterRange > * ranges()
Definition regexp-ast.h:416
void Intersect(RegExpClassSetOperand *other, ZoneList< CharacterRange > *temp_ranges, Zone *zone)
CharacterClassStrings * strings()
Definition regexp-ast.h:417
void Union(RegExpClassSetOperand *other, Zone *zone)
void Subtract(RegExpClassSetOperand *other, ZoneList< CharacterRange > *temp_ranges, Zone *zone)
ZoneList< RegExpTree * > * alternatives() const
Definition regexp-ast.h:229
void RationalizeConsecutiveAtoms(RegExpCompiler *compiler)
void FixSingleCharacterDisjunctions(RegExpCompiler *compiler)
bool SortConsecutiveAtoms(RegExpCompiler *compiler)
Builder(bool is_positive, RegExpNode *on_success, int stack_pointer_register, int position_register, int capture_register_count=0, int capture_register_start=0)
RegExpTree * body() const
Definition regexp-ast.h:676
static RegExpNode * ToNode(int min, int max, bool is_greedy, RegExpTree *body, RegExpCompiler *compiler, RegExpNode *on_success, bool not_at_start=false)
ZoneList< TextElement > * elements()
Definition regexp-ast.h:538
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)=0
virtual int min_match()=0
static const int kInfinity
Definition regexp-ast.h:196
virtual Interval CaptureRegisters()
Definition regexp-ast.h:208
static const uint32_t kMaxOneByteCharCodeU
Definition string.h:501
static const int32_t kMaxOneByteCharCode
Definition string.h:500
static TextElement Atom(RegExpAtom *atom)
static TextNode * CreateForSurrogatePair(Zone *zone, CharacterRange lead, ZoneList< CharacterRange > *trail_ranges, bool read_backward, RegExpNode *on_success)
static TextNode * CreateForCharacterRanges(Zone *zone, ZoneList< CharacterRange > *ranges, bool read_backward, RegExpNode *on_success)
V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList< CharacterRange > *base)
V8_INLINE int length() const
Definition zone-list.h:101
T & at(int i) const
Definition zone-list.h:88
V8_INLINE void Rewind(int pos)
V8_INLINE bool is_empty() const
Definition zone-list.h:100
void Add(const T &element, Zone *zone)
T * New(Args &&... args)
Definition zone.h:114
Zone * zone_
bool is_empty
Definition sweeper.cc:229
uint32_t count
Handle< SharedFunctionInfo > info
int end
LineAndColumn current
std::vector< std::unique_ptr< InstanceTypeTree > > children
std::optional< TNode< JSArray > > a
double second
ZoneVector< RpoNumber > & result
Point to
int n
Definition mul-fft.cc:296
int r
Definition mul-fft.cc:298
unsigned int uchar
Definition unicode.h:21
uint32_t uc32
Definition strings.h:19
uint16_t uc16
Definition strings.h:18
static const base::uc32 kNonBmpEnd
static const base::uc32 kNonBmpStart
constexpr int kBitsPerByte
Definition globals.h:682
static const base::uc32 kTrailSurrogateStart
constexpr bool IsEitherUnicode(RegExpFlags f)
bool NeedsUnicodeCaseEquivalents(RegExpFlags flags)
Flag flags[]
Definition flags.cc:3797
constexpr uint32_t kMaxUtf16CodeUnitU
constexpr base::uc32 kMaxCodePoint
base::Flags< RegExpFlag > RegExpFlags
static const base::uc32 kTrailSurrogateEnd
constexpr int kUInt32Size
Definition globals.h:403
static const base::uc32 kLeadSurrogateStart
static const base::uc32 kLeadSurrogateEnd
constexpr int kMaxUtf16CodeUnit
bool RangeContainsLatin1Equivalents(CharacterRange range)
bool ok_to_expand_
const RegExpFlags previous_flags_
ZoneList< RegExpTree * > * terms_
RegExpCompiler * compiler_
static const int kMaxExpansionFactor
int saved_expansion_factor_
#define DCHECK_LE(v1, v2)
Definition logging.h:490
#define CHECK(condition)
Definition logging.h:124
#define DCHECK_NOT_NULL(val)
Definition logging.h:492
#define DCHECK_IMPLIES(v1, v2)
Definition logging.h:493
#define DCHECK_NE(v1, v2)
Definition logging.h:486
#define DCHECK(condition)
Definition logging.h:482
#define DCHECK_LT(v1, v2)
Definition logging.h:489
#define DCHECK_EQ(v1, v2)
Definition logging.h:485
#define DCHECK_GT(v1, v2)
Definition logging.h:487
#define arraysize(array)
Definition macros.h:67
#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName)
Definition macros.h:130
static const int kMaxWidth
Definition unicode.h:294
static const int kMaxWidth
Definition unicode.h:298
#define V8_NODISCARD
Definition v8config.h:693
const wasm::FunctionBody & body_