v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
regexp-compiler.h
Go to the documentation of this file.
1// Copyright 2019 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_REGEXP_REGEXP_COMPILER_H_
6#define V8_REGEXP_REGEXP_COMPILER_H_
7
8#include <bitset>
9
11#include "src/base/strings.h"
14
15namespace v8 {
16namespace internal {
17
18class DynamicBitSet;
19class Isolate;
20
21namespace regexp_compiler_constants {
22
23// The '2' variant is has inclusive from and exclusive to.
24// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
25// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
26constexpr base::uc32 kRangeEndMarker = 0x110000;
27constexpr int kSpaceRanges[] = {
28 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
29 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
30 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
32
33constexpr int kWordRanges[] = {'0', '9' + 1, 'A', 'Z' + 1, '_',
34 '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
36constexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
41constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D, 0x000E,
42 0x2028, 0x202A, kRangeEndMarker};
44
45// More makes code generation slower, less makes V8 benchmark score lower.
46constexpr uint32_t kMaxLookaheadForBoyerMoore = 8;
47// In a 3-character pattern you can maximally step forwards 3 characters
48// at a time, which is not always enough to pay for the extra logic.
49constexpr uint32_t kPatternTooShortForBoyerMoore = 2;
50
51} // namespace regexp_compiler_constants
52
54 // Both unicode (or unicode sets) and ignore_case flags are set. We need to
55 // use ICU to find the closure over case equivalents.
56 return IsEitherUnicode(flags) && IsIgnoreCase(flags);
57}
58
59// Details of a quick mask-compare check that can look ahead in the
60// input stream.
62 public:
67 bool Rationalize(bool one_byte);
68 // Merge in the information from another branch of an alternation.
69 void Merge(QuickCheckDetails* other, int from_index);
70 // Advance the current position by some amount.
71 void Advance(int by, bool one_byte);
72 void Clear();
73 bool cannot_match() { return cannot_match_; }
81 int characters() { return characters_; }
83 Position* positions(int index) {
84 DCHECK_LE(0, index);
85 DCHECK_GT(characters_, index);
86 return positions_ + index;
87 }
88 uint32_t mask() { return mask_; }
89 uint32_t value() { return value_; }
90
91 private:
92 // How many characters do we have quick check information from. This is
93 // the same for all branches of a choice node.
96 // These values are the condensate of the above array after Rationalize().
97 uint32_t mask_;
98 uint32_t value_;
99 // If set to true, there is no way this quick check can match at all.
100 // E.g., if it requires to be at the start of the input, and isn't.
102};
103
104// Improve the speed that we scan for an initial point where a non-anchored
105// regexp can match by using a Boyer-Moore-like table. This is done by
106// identifying non-greedy non-capturing loops in the nodes that eat any
107// character one at a time. For example in the middle of the regexp
108// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
109// inserted at the start of any non-anchored regexp.
110//
111// When we have found such a loop we look ahead in the nodes to find the set of
112// characters that can come at given distances. For example for the regexp
113// /.?foo/ we know that there are at least 3 characters ahead of us, and the
114// sets of characters that can occur are [any, [f, o], [o]]. We find a range in
115// the lookahead info where the set of characters is reasonably constrained. In
116// our example this is from index 1 to 2 (0 is not constrained). We can now
117// look 3 characters ahead and if we don't find one of [f, o] (the union of
118// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
119//
120// For Unicode input strings we do the same, but modulo 128.
121//
122// We also look at the first string fed to the regexp and use that to get a hint
123// of the character frequencies in the inputs. This affects the assessment of
124// whether the set of characters is 'reasonably constrained'.
125//
126// We also have another lookahead mechanism (called quick check in the code),
127// which uses a wide load of multiple characters followed by a mask and compare
128// to determine whether a match is possible at this point.
133 kLatticeUnknown = 3 // Can also mean both in and out.
135
137 return static_cast<ContainedInLattice>(a | b);
138}
139
141 public:
142 bool at(int i) const { return map_[i]; }
143
144 static constexpr int kMapSize = 128;
145 static constexpr int kMask = kMapSize - 1;
146
147 int map_count() const { return map_count_; }
148
149 void Set(int character);
150 void SetInterval(const Interval& interval);
151 void SetAll();
152
153 bool is_non_word() { return w_ == kLatticeOut; }
154 bool is_word() { return w_ == kLatticeIn; }
155
156 using Bitset = std::bitset<kMapSize>;
157 Bitset raw_bitset() const { return map_; }
158
159 private:
161 int map_count_ = 0; // Number of set bits in the map.
162 ContainedInLattice w_ = kNotYet; // The \w character class.
163};
164
166 public:
167 BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
168
169 int length() { return length_; }
170 int max_char() { return max_char_; }
172
173 int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); }
174
176
177 void Set(int map_number, int character) {
178 if (character > max_char_) return;
179 BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
180 info->Set(character);
181 }
182
183 void SetInterval(int map_number, const Interval& interval) {
184 if (interval.from() > max_char_) return;
185 BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
186 if (interval.to() > max_char_) {
187 info->SetInterval(Interval(interval.from(), max_char_));
188 } else {
189 info->SetInterval(interval);
190 }
191 }
192
193 void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); }
194
195 void SetRest(int from_map) {
196 for (int i = from_map; i < length_; i++) SetAll(i);
197 }
199
200 private:
201 // This is the value obtained by EatsAtLeast. If we do not have at least this
202 // many characters left in the sample string then the match is bound to fail.
203 // Therefore it is OK to read a character this far ahead of the current match
204 // point.
207 // 0xff for Latin1, 0xffff for UTF-16.
210
211 int GetSkipTable(
212 int min_lookahead, int max_lookahead,
213 DirectHandle<ByteArray> boolean_skip_table,
215 bool FindWorthwhileInterval(int* from, int* to);
216 int FindBestInterval(int max_number_of_chars, int old_biggest_points,
217 int* from, int* to);
218};
219
220// There are many ways to generate code for a node. This class encapsulates
221// the current way we should be generating. In other words it encapsulates
222// the current state of the code generator. The effect of this is that we
223// generate code for paths that the matcher can take through the regular
224// expression. A given node in the regexp can be code-generated several times
225// as it can be part of several traces. For example for the regexp:
226// /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
227// of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
228// to match foo is generated only once (the traces have a common prefix). The
229// code to store the capture is deferred and generated (twice) after the places
230// where baz has been matched.
231class Trace {
232 public:
233 // A value for a property that is either known to be true, know to be false,
234 // or not known.
235 enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 };
236
252
254 public:
256 : DeferredAction(ActionNode::STORE_POSITION, reg),
257 cp_offset_(trace->cp_offset()),
259 int cp_offset() { return cp_offset_; }
260 bool is_capture() { return is_capture_; }
261
262 private:
266 };
267
269 public:
271 : DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg),
272 value_(value) {}
273 int value() { return value_; }
274
275 private:
277 };
278
280 public:
282 : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) {}
283 Interval range() { return range_; }
284
285 private:
287 };
288
290 public:
292 : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) {}
293 };
294
296 : cp_offset_(0),
297 actions_(nullptr),
298 backtrack_(nullptr),
299 stop_node_(nullptr),
300 loop_label_(nullptr),
303 flush_budget_(100),
305
306 // End the trace. This involves flushing the deferred actions in the trace
307 // and pushing a backtrack location onto the backtrack stack. Once this is
308 // done we can start a new trace or go to one that has already been
309 // generated.
310 void Flush(RegExpCompiler* compiler, RegExpNode* successor);
311 int cp_offset() { return cp_offset_; }
313 // A trivial trace is one that has no deferred actions or other state that
314 // affects the assumptions used when generating code. There is no recorded
315 // backtrack location in a trivial trace, so with a trivial trace we will
316 // generate code that, on a failure to match, gets the backtrack location
317 // from the backtrack stack rather than using a direct jump instruction. We
318 // always start code generation with a trivial trace and non-trivial traces
319 // are created as we emit code for nodes or add to the list of deferred
320 // actions in the trace. The location of the code generated for a node using
321 // a trivial trace is recorded in a label in the node so that gotos can be
322 // generated to that code.
323 bool is_trivial() {
324 return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
327 }
335 int flush_budget() { return flush_budget_; }
337 bool mentions_reg(int reg);
338 // Returns true if a deferred position store exists to the specified
339 // register and stores the offset in the out-parameter. Otherwise
340 // returns false.
341 bool GetStoredPosition(int reg, int* cp_offset);
342 // These set methods and AdvanceCurrentPositionInTrace should be used only on
343 // new traces - the intention is that traces are immutable after creation.
344 void add_action(DeferredAction* new_action) {
345 DCHECK(new_action->next_ == nullptr);
346 new_action->next_ = actions_;
347 actions_ = new_action;
348 }
359 void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
360
361 private:
362 int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone);
363 void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register,
364 const DynamicBitSet& affected_registers,
365 DynamicBitSet* registers_to_pop,
366 DynamicBitSet* registers_to_clear, Zone* zone);
367 void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register,
368 const DynamicBitSet& registers_to_pop,
369 const DynamicBitSet& registers_to_clear);
380};
381
383 public:
384 explicit GreedyLoopState(bool not_at_start);
385
386 Label* label() { return &label_; }
388
389 private:
392};
393
402
403// Analysis performs assertion propagation and computes eats_at_least_ values.
404// See the comments on AssertionPropagator and EatsAtLeastPropagator for more
405// details.
406RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags,
407 RegExpNode* node);
408
410 public:
416
422
423 // Does not measure in percent, but rather per-128 (the table size from the
424 // regexp macro assembler).
425 int Frequency(int in_character) {
426 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
427 if (total_samples_ < 1) return 1; // Division by zero.
428 int freq_in_per128 =
429 (frequencies_[in_character].counter() * 128) / total_samples_;
430 return freq_in_per128;
431 }
432
433 private:
435 public:
439
440 void Increment() { counter_++; }
441 int counter() { return counter_; }
442 int character() { return character_; }
443
444 private:
447 };
448
449 private:
452};
453
455 public:
456 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
457 RegExpFlags flags, bool is_one_byte);
458
461 reg_exp_too_big_ = true;
462 return next_register_;
463 }
464 return next_register_++;
465 }
466
467 // Lookarounds to match lone surrogates for unicode character class matches
468 // are never nested. We can therefore reuse registers.
475
482
483 struct CompilationResult final {
484 explicit CompilationResult(RegExpError err) : error(err) {}
487
489 return CompilationResult(RegExpError::kTooLarge);
490 }
491
492 bool Succeeded() const { return error == RegExpError::kNone; }
493
494 const RegExpError error = RegExpError::kNone;
497 };
498
500 RegExpNode* start, int capture_count,
502
503 // Preprocessing is the final step of node creation before analysis
504 // and assembly. It includes:
505 // - Wrapping the body of the regexp in capture 0.
506 // - Inserting the implicit .* before/after the regexp if necessary.
507 // - If the input is a one-byte string, filtering out nodes that can't match.
508 // - Fixing up regexp matches that start within a surrogate pair.
509 RegExpNode* PreprocessRegExp(RegExpCompileData* data, bool is_one_byte);
510
511 // If the regexp matching starts within a surrogate pair, step back to the
512 // lead surrogate and start matching from there.
514
515 inline void AddWork(RegExpNode* node) {
516 if (!node->on_work_list() && !node->label()->is_bound()) {
517 node->set_on_work_list(true);
518 work_list_->push_back(node);
519 }
520 }
521
522 static const int kImplementationOffset = 0;
523 static const int kNumberOfRegistersOffset = 0;
524 static const int kCodeOffset = 1;
525
527 EndNode* accept() { return accept_; }
528
529 static const int kMaxRecursion = 100;
530 inline int recursion_depth() { return recursion_depth_; }
533
534 inline RegExpFlags flags() const { return flags_; }
535 inline void set_flags(RegExpFlags flags) { flags_ = flags; }
536
538
539 inline bool one_byte() { return one_byte_; }
540 inline bool optimize() { return optimize_; }
541 inline void set_optimize(bool value) { optimize_ = value; }
542 inline bool limiting_recursion() { return limiting_recursion_; }
543 inline void set_limiting_recursion(bool value) {
545 }
546 bool read_backward() { return read_backward_; }
547 void set_read_backward(bool value) { read_backward_ = value; }
549
554
555 // The recursive nature of ToNode node generation means we may run into stack
556 // overflow issues. We introduce periodic checks to detect these, and the
557 // tick counter helps limit overhead of these checks.
558 // TODO(jgruber): This is super hacky and should be replaced by an abort
559 // mechanism or iterative node generation.
566
567 Isolate* isolate() const { return isolate_; }
568 Zone* zone() const { return zone_; }
569
570 static const int kNoRegister = -1;
571
572 private:
591};
592
593// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
618
619// We need to check for the following characters: 0x39C 0x3BC 0x178.
620// TODO(jgruber): Move to CharacterRange.
622
623} // namespace internal
624} // namespace v8
625
626#endif // V8_REGEXP_REGEXP_COMPILER_H_
void SetInterval(int map_number, const Interval &interval)
ZoneList< BoyerMoorePositionInfo * > * bitmaps_
int GetSkipTable(int min_lookahead, int max_lookahead, DirectHandle< ByteArray > boolean_skip_table, DirectHandle< ByteArray > nibble_table=DirectHandle< ByteArray >{})
BoyerMoorePositionInfo * at(int i)
BoyerMooreLookahead(int length, RegExpCompiler *compiler, Zone *zone)
bool FindWorthwhileInterval(int *from, int *to)
int FindBestInterval(int max_number_of_chars, int old_biggest_points, int *from, int *to)
void EmitSkipInstructions(RegExpMacroAssembler *masm)
void Set(int map_number, int character)
void SetInterval(const Interval &interval)
int Frequency(int in_character)
CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]
void Merge(QuickCheckDetails *other, int from_index)
void set_characters(int characters)
Position * positions(int index)
void Advance(int by, bool one_byte)
void set_limiting_recursion(bool value)
static const int kImplementationOffset
FrequencyCollator frequency_collator_
static const int kNumberOfRegistersOffset
void set_flags(RegExpFlags flags)
ZoneVector< RegExpNode * > * work_list_
RegExpMacroAssembler * macro_assembler()
RegExpNode * OptionallyStepBackToLeadSurrogate(RegExpNode *on_success)
void set_current_expansion_factor(int value)
RegExpMacroAssembler * macro_assembler_
RegExpNode * PreprocessRegExp(RegExpCompileData *data, bool is_one_byte)
RegExpCompiler(Isolate *isolate, Zone *zone, int capture_count, RegExpFlags flags, bool is_one_byte)
FrequencyCollator * frequency_collator()
void AddWork(RegExpNode *node)
CompilationResult Assemble(Isolate *isolate, RegExpMacroAssembler *assembler, RegExpNode *start, int capture_count, DirectHandle< String > pattern)
DeferredAction(ActionNode::ActionType action_type, int reg)
ActionNode::ActionType action_type()
DeferredCapture(int reg, bool is_capture, Trace *trace)
QuickCheckDetails * quick_check_performed()
void PerformDeferredActions(RegExpMacroAssembler *macro, int max_register, const DynamicBitSet &affected_registers, DynamicBitSet *registers_to_pop, DynamicBitSet *registers_to_clear, Zone *zone)
void set_stop_node(RegExpNode *node)
void Flush(RegExpCompiler *compiler, RegExpNode *successor)
void set_at_start(TriBool at_start)
DeferredAction * actions_
RegExpNode * stop_node()
void AdvanceCurrentPositionInTrace(int by, RegExpCompiler *compiler)
void set_characters_preloaded(int count)
void add_action(DeferredAction *new_action)
int FindAffectedRegisters(DynamicBitSet *affected_registers, Zone *zone)
void set_quick_check_performed(QuickCheckDetails *d)
DeferredAction * actions()
void RestoreAffectedRegisters(RegExpMacroAssembler *macro, int max_register, const DynamicBitSet &registers_to_pop, const DynamicBitSet &registers_to_clear)
bool mentions_reg(int reg)
bool GetStoredPosition(int reg, int *cp_offset)
void set_backtrack(Label *backtrack)
void set_bound_checked_up_to(int to)
QuickCheckDetails quick_check_performed_
void set_loop_label(Label *label)
void set_flush_budget(int to)
const CharacterRangeVector * lead_surrogates() const
const CharacterRangeVector * bmp() const
V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList< CharacterRange > *base)
const CharacterRangeVector * trail_surrogates() const
const CharacterRangeVector * non_bmp() const
int start
uint32_t count
Label label
std::string pattern
Node * node
LiftoffRegister reg
Point to
RegListBase< RegisterT > registers
uint32_t uc32
Definition strings.h:19
constexpr bool IsEitherUnicode(RegExpFlags f)
ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b)
bool NeedsUnicodeCaseEquivalents(RegExpFlags flags)
BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL int character
base::Flags< RegExpFlag > RegExpFlags
RegExpError AnalyzeRegExp(Isolate *isolate, bool is_one_byte, RegExpFlags flags, RegExpNode *node)
return value
Definition map-inl.h:893
static const base::uc32 kLeadSurrogateStart
bool RangeContainsLatin1Equivalents(CharacterRange range)
#define DCHECK_LE(v1, v2)
Definition logging.h:490
#define DCHECK(condition)
Definition logging.h:482
#define DCHECK_GT(v1, v2)
Definition logging.h:487
#define V8_EXPORT_PRIVATE
Definition macros.h:460
#define arraysize(array)
Definition macros.h:67
static const int kEatsAtLeastNotYetInitialized
CompilationResult(DirectHandle< Object > code, int registers)