v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
earley-parser.h
Go to the documentation of this file.
1// Copyright 2018 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_TORQUE_EARLEY_PARSER_H_
6#define V8_TORQUE_EARLEY_PARSER_H_
7
8#include <map>
9#include <memory>
10#include <optional>
11#include <vector>
12
13#include "src/base/contextual.h"
15#include "src/torque/utils.h"
16
17namespace v8::internal::torque {
18
19class Symbol;
20class Item;
21
23 public:
24 enum class TypeId;
25 virtual ~ParseResultHolderBase() = default;
26 template <class T>
27 T& Cast();
28 template <class T>
29 const T& Cast() const;
30
31 protected:
32 explicit ParseResultHolderBase(TypeId type_id) : type_id_(type_id) {
33 // MSVC wrongly complains about type_id_ being an unused private field.
35 }
36
37 private:
39};
40
42 kStdString,
43 kBool,
44 kInt32,
45 kDouble,
46 kIntegerLiteral,
47 kStdVectorOfString,
48 kExpressionPtr,
49 kIdentifierPtr,
50 kOptionalIdentifierPtr,
51 kStatementPtr,
52 kDeclarationPtr,
53 kTypeExpressionPtr,
54 kOptionalTypeExpressionPtr,
55 kTryHandlerPtr,
56 kNameAndTypeExpression,
57 kEnumEntry,
58 kStdVectorOfEnumEntry,
59 kImplicitParameters,
60 kOptionalImplicitParameters,
61 kNameAndExpression,
62 kAnnotation,
63 kVectorOfAnnotation,
64 kAnnotationParameter,
65 kOptionalAnnotationParameter,
66 kClassFieldExpression,
67 kStructFieldExpression,
68 kBitFieldDeclaration,
69 kStdVectorOfNameAndTypeExpression,
70 kStdVectorOfNameAndExpression,
71 kStdVectorOfClassFieldExpression,
72 kStdVectorOfStructFieldExpression,
73 kStdVectorOfBitFieldDeclaration,
74 kIncrementDecrementOperator,
75 kOptionalStdString,
76 kStdVectorOfStatementPtr,
77 kStdVectorOfDeclarationPtr,
78 kStdVectorOfStdVectorOfDeclarationPtr,
79 kStdVectorOfExpressionPtr,
80 kExpressionWithSource,
81 kParameterList,
82 kTypeList,
83 kOptionalTypeList,
84 kLabelAndTypes,
85 kStdVectorOfLabelAndTypes,
86 kStdVectorOfTryHandlerPtr,
87 kOptionalStatementPtr,
88 kOptionalExpressionPtr,
89 kTypeswitchCase,
90 kStdVectorOfTypeswitchCase,
91 kStdVectorOfIdentifierPtr,
92 kOptionalClassBody,
93 kGenericParameter,
94 kGenericParameters,
95
96 kJsonValue,
97 kJsonMember,
98 kStdVectorOfJsonValue,
99 kStdVectorOfJsonMember,
100};
101
103
104template <class T>
106 public:
107 explicit ParseResultHolder(T value)
108 : ParseResultHolderBase(id), value_(std::move(value)) {}
109
110 private:
114};
115
116template <class T>
121
122template <class T>
125 return static_cast<const ParseResultHolder<T>*>(this)->value_;
126}
127
129 public:
130 template <class T>
131 explicit ParseResult(T x) : value_(new ParseResultHolder<T>(std::move(x))) {}
132
133 template <class T>
134 const T& Cast() const& {
135 return value_->Cast<T>();
136 }
137 template <class T>
138 T& Cast() & {
139 return value_->Cast<T>();
140 }
141 template <class T>
142 T&& Cast() && {
143 return std::move(value_->Cast<T>());
144 }
145
146 private:
147 std::unique_ptr<ParseResultHolderBase> value_;
148};
149
150using InputPosition = const char*;
151
160
162 public:
163 explicit ParseResultIterator(std::vector<ParseResult> results,
165 : results_(std::move(results)), matched_input_(matched_input) {}
166
169
171 CHECK_LT(i_, results_.size());
172 return std::move(results_[i_++]);
173 }
174 template <class T>
175 T NextAs() {
176 return std::move(Next().Cast<T>());
177 }
178 bool HasNext() const { return i_ < results_.size(); }
179
180 const MatchedInput& matched_input() const { return matched_input_; }
181
182 private:
183 std::vector<ParseResult> results_;
184 size_t i_ = 0;
186};
187
189 std::vector<Symbol*> token_symbols;
190 std::vector<MatchedInput> token_contents;
191};
192
193using Action =
194 std::optional<ParseResult> (*)(ParseResultIterator* child_results);
195
196inline std::optional<ParseResult> DefaultAction(
197 ParseResultIterator* child_results) {
198 if (!child_results->HasNext()) return std::nullopt;
199 return child_results->Next();
200}
201
202template <class T, Action action>
204 return [](ParseResultIterator* child_results) -> std::optional<ParseResult> {
205 auto result = action(child_results);
206 if (!result) return result;
207 return ParseResult{std::vector<T>{(*result).Cast<T>()}};
208 };
209}
210
211// A rule of the context-free grammar. Each rule can have an action attached to
212// it, which is executed after the parsing is finished.
213class Rule final {
214 public:
215 explicit Rule(std::vector<Symbol*> right_hand_side,
216 Action action = DefaultAction)
217 : right_hand_side_(std::move(right_hand_side)), action_(action) {}
218
219 Symbol* left() const {
221 return left_hand_side_;
222 }
223 const std::vector<Symbol*>& right() const { return right_hand_side_; }
224
225 void SetLeftHandSide(Symbol* left_hand_side) {
227 left_hand_side_ = left_hand_side;
228 }
229
230 V8_EXPORT_PRIVATE std::optional<ParseResult> RunAction(
231 const Item* completed_item, const LexerResult& tokens) const;
232
233 private:
235 std::vector<Symbol*> right_hand_side_;
237};
238
239// A Symbol represents a terminal or a non-terminal of the grammar.
240// It stores the list of rules, which have this symbol as the
241// left-hand side.
242// Terminals have an empty list of rules, they are created by the Lexer
243// instead of from rules.
244// Symbols need to reside at stable memory addresses, because the addresses are
245// used in the parser.
246class Symbol {
247 public:
248 Symbol() = default;
249 Symbol(std::initializer_list<Rule> rules) { *this = rules; }
250
251 // Disallow copying and moving to ensure Symbol has a stable address.
252 Symbol(const Symbol&) = delete;
253 Symbol& operator=(const Symbol&) = delete;
254
255 V8_EXPORT_PRIVATE Symbol& operator=(std::initializer_list<Rule> rules);
256
257 bool IsTerminal() const { return rules_.empty(); }
258 Rule* rule(size_t index) const { return rules_[index].get(); }
259 size_t rule_number() const { return rules_.size(); }
260
261 void AddRule(const Rule& rule) {
262 rules_.push_back(std::make_unique<Rule>(rule));
263 rules_.back()->SetLeftHandSide(this);
264 }
265
266 V8_EXPORT_PRIVATE std::optional<ParseResult> RunAction(
267 const Item* item, const LexerResult& tokens);
268
269 private:
270 std::vector<std::unique_ptr<Rule>> rules_;
271};
272
273// Items are the core datastructure of Earley's algorithm.
274// They consist of a (partially) matched rule, a marked position inside of the
275// right-hand side of the rule (traditionally written as a dot) and an input
276// range from {start} to {pos} that matches the symbols of the right-hand side
277// that are left of the mark. In addition, they store a child and a left-sibling
278// pointer to reconstruct the AST in the end.
279class Item {
280 public:
281 Item(const Rule* rule, size_t mark, size_t start, size_t pos)
282 : rule_(rule), mark_(mark), start_(start), pos_(pos) {
284 }
285
286 // A complete item has the mark at the right end, which means the input range
287 // matches the complete rule.
288 bool IsComplete() const {
290 return mark_ == right().size();
291 }
292
293 // The symbol right after the mark is expected at {pos} for this item to
294 // advance.
296 DCHECK(!IsComplete());
298 return right()[mark_];
299 }
300
301 // We successfully parsed NextSymbol() between {pos} and {new_pos}.
302 // If NextSymbol() was a non-terminal, then {child} is a pointer to a
303 // completed item for this parse.
304 // We create a new item, which moves the mark one forward.
305 Item Advance(size_t new_pos, const Item* child = nullptr) const {
306 if (child) {
307 DCHECK(child->IsComplete());
308 DCHECK_EQ(pos(), child->start());
309 DCHECK_EQ(new_pos, child->pos());
310 DCHECK_EQ(NextSymbol(), child->left());
311 }
312 Item result(rule_, mark_ + 1, start_, new_pos);
313 result.prev_ = this;
314 result.child_ = child;
315 return result;
316 }
317
318 // Collect the items representing the AST children of this completed item.
319 std::vector<const Item*> Children() const;
320 // The matched input separated according to the next branching AST level.
321 std::string SplitByChildren(const LexerResult& tokens) const;
322 // Check if {other} results in the same AST as this Item.
323 void CheckAmbiguity(const Item& other, const LexerResult& tokens) const;
324
326 const MatchedInput& start = tokens.token_contents[start_];
327 const MatchedInput& end = start_ == pos_ ? tokens.token_contents[start_]
328 : tokens.token_contents[pos_ - 1];
329 CHECK_EQ(start.pos.source, end.pos.source);
330 SourcePosition combined{start.pos.source, start.pos.start, end.pos.end};
331
332 return {start.begin, end.end, combined};
333 }
334
335 // We exclude {prev_} and {child_} from equality and hash computations,
336 // because they are just globally unique data associated with an item.
337 bool operator==(const Item& other) const {
338 return rule_ == other.rule_ && mark_ == other.mark_ &&
339 start_ == other.start_ && pos_ == other.pos_;
340 }
341
342 friend size_t hash_value(const Item& i) {
343 return base::hash_combine(i.rule_, i.mark_, i.start_, i.pos_);
344 }
345
346 const Rule* rule() const { return rule_; }
347 Symbol* left() const { return rule_->left(); }
348 const std::vector<Symbol*>& right() const { return rule_->right(); }
349 size_t pos() const { return pos_; }
350 size_t start() const { return start_; }
351
352 private:
353 const Rule* rule_;
354 size_t mark_;
355 size_t start_;
356 size_t pos_;
357
358 const Item* prev_ = nullptr;
359 const Item* child_ = nullptr;
360};
361
362inline std::optional<ParseResult> Symbol::RunAction(const Item* item,
363 const LexerResult& tokens) {
364 DCHECK(item->IsComplete());
365 DCHECK_EQ(item->left(), this);
366 return item->rule()->RunAction(item, tokens);
367}
368
370 Symbol* start, const LexerResult& tokens,
371 std::unordered_set<Item, base::hash<Item>>* processed);
372
373inline std::optional<ParseResult> ParseTokens(Symbol* start,
374 const LexerResult& tokens) {
375 std::unordered_set<Item, base::hash<Item>> table;
376 const Item* final_item = RunEarleyAlgorithm(start, tokens, &table);
377 return start->RunAction(final_item, tokens);
378}
379
380// The lexical syntax is dynamically defined while building the grammar by
381// adding patterns and keywords to the Lexer.
382// The term keyword here can stand for any fixed character sequence, including
383// operators and parentheses.
384// Each pattern or keyword automatically gets a terminal symbol associated with
385// it. These symbols form the result of the lexing.
386// Patterns and keywords are matched using the longest match principle. If the
387// longest matching pattern coincides with a keyword, the keyword symbol is
388// chosen instead of the pattern.
389// In addition, there is a single whitespace pattern which is consumed but does
390// not become part of the token list.
391class Lexer {
392 public:
393 // Functions to define patterns. They try to match starting from {pos}. If
394 // successful, they return true and advance {pos}. Otherwise, {pos} stays
395 // unchanged.
397
399 match_whitespace_ = whitespace;
400 }
401
403 Symbol* Token(const std::string& keyword) { return &keywords_[keyword]; }
404 V8_EXPORT_PRIVATE LexerResult RunLexer(const std::string& input);
405
406 private:
408 std::map<PatternFunction, Symbol> patterns_;
409 std::map<std::string, Symbol> keywords_;
411};
412
413// A grammar can have a result, which is the results of the start symbol.
414// Grammar is intended to be subclassed, with Symbol members forming the
415// mutually recursive rules of the grammar.
416class Grammar {
417 public:
419
420 explicit Grammar(Symbol* start) : start_(start) {}
421
422 std::optional<ParseResult> Parse(const std::string& input) {
423 LexerResult tokens = lexer().RunLexer(input);
424 return ParseTokens(start_, tokens);
425 }
426
427 protected:
428 Symbol* Token(const std::string& s) { return lexer_.Token(s); }
431
432 // NewSymbol() allocates a fresh symbol and stores it in the current grammar.
433 // This is necessary to define helpers that create new symbols.
434 Symbol* NewSymbol(std::initializer_list<Rule> rules = {}) {
435 auto symbol = std::make_unique<Symbol>(rules);
436 Symbol* result = symbol.get();
437 generated_symbols_.push_back(std::move(symbol));
438 return result;
439 }
440
441 // Helper functions to define lexer patterns. If they match, they return true
442 // and advance {pos}. Otherwise, {pos} is unchanged.
443 V8_EXPORT_PRIVATE static bool MatchChar(int (*char_class)(int),
445 V8_EXPORT_PRIVATE static bool MatchChar(bool (*char_class)(char),
448 V8_EXPORT_PRIVATE static bool MatchString(const char* s, InputPosition* pos);
449
450 // The action MatchInput() produces the input matched by the rule as
451 // result.
452 static std::optional<ParseResult> YieldMatchedInput(
453 ParseResultIterator* child_results) {
454 return ParseResult{child_results->matched_input().ToString()};
455 }
456
457 // Create a new symbol to parse the given sequence of symbols.
458 // At most one of the symbols can return a result.
459 Symbol* Sequence(std::vector<Symbol*> symbols) {
460 return NewSymbol({Rule(std::move(symbols))});
461 }
462
463 template <class T, T value>
464 static std::optional<ParseResult> YieldIntegralConstant(
465 ParseResultIterator* child_results) {
466 return ParseResult{value};
467 }
468
469 template <class T>
470 static std::optional<ParseResult> YieldDefaultValue(
471 ParseResultIterator* child_results) {
472 return ParseResult{T{}};
473 }
474
475 template <class From, class To>
476 static std::optional<ParseResult> CastParseResult(
477 ParseResultIterator* child_results) {
478 To result = child_results->NextAs<From>();
479 return ParseResult{std::move(result)};
480 }
481
482 // Try to parse {s} and return the result of type {Result} casted to {T}.
483 // Otherwise, the result is a default-constructed {T}.
484 template <class T, class Result = T>
489
490 template <class T>
491 static std::optional<ParseResult> MakeSingletonVector(
492 ParseResultIterator* child_results) {
493 T x = child_results->NextAs<T>();
494 std::vector<T> result;
495 result.push_back(std::move(x));
496 return ParseResult{std::move(result)};
497 }
498
499 template <class T>
500 static std::optional<ParseResult> MakeExtendedVector(
501 ParseResultIterator* child_results) {
502 std::vector<T> l = child_results->NextAs<std::vector<T>>();
503 T x = child_results->NextAs<T>();
504 l.push_back(std::move(x));
505 return ParseResult{std::move(l)};
506 }
507
508 // For example, NonemptyList(Token("A"), Token(",")) parses any of
509 // A or A,A or A,A,A and so on.
510 template <class T>
511 Symbol* NonemptyList(Symbol* element, std::optional<Symbol*> separator = {}) {
512 Symbol* list = NewSymbol();
513 *list = {Rule({element}, MakeSingletonVector<T>),
515 ? Rule({list, *separator, element}, MakeExtendedVector<T>)
516 : Rule({list, element}, MakeExtendedVector<T>)};
517 return list;
518 }
519
520 template <class T>
521 Symbol* List(Symbol* element, std::optional<Symbol*> separator = {}) {
523 }
524
525 template <class T>
529
534
535 Lexer& lexer() { return lexer_; }
536
537 private:
539 std::vector<std::unique_ptr<Symbol>> generated_symbols_;
541};
542
543} // namespace v8::internal::torque
544
545#endif // V8_TORQUE_EARLEY_PARSER_H_
SourcePosition pos
Lexer::PatternFunction PatternFunction
std::vector< std::unique_ptr< Symbol > > generated_symbols_
static std::optional< ParseResult > MakeSingletonVector(ParseResultIterator *child_results)
void SetWhitespace(PatternFunction ws)
Symbol * Pattern(PatternFunction pattern)
Symbol * CheckIf(Symbol *x)
static std::optional< ParseResult > CastParseResult(ParseResultIterator *child_results)
Symbol * TryOrDefault(Symbol *s)
Symbol * NonemptyList(Symbol *element, std::optional< Symbol * > separator={})
static std::optional< ParseResult > YieldDefaultValue(ParseResultIterator *child_results)
Symbol * Sequence(std::vector< Symbol * > symbols)
static std::optional< ParseResult > YieldMatchedInput(ParseResultIterator *child_results)
static std::optional< ParseResult > MakeExtendedVector(ParseResultIterator *child_results)
std::optional< ParseResult > Parse(const std::string &input)
Symbol * List(Symbol *element, std::optional< Symbol * > separator={})
static V8_EXPORT_PRIVATE bool MatchAnyChar(InputPosition *pos)
Symbol * Optional(Symbol *x)
Symbol * Token(const std::string &s)
static std::optional< ParseResult > YieldIntegralConstant(ParseResultIterator *child_results)
static V8_EXPORT_PRIVATE bool MatchString(const char *s, InputPosition *pos)
static V8_EXPORT_PRIVATE bool MatchChar(int(*char_class)(int), InputPosition *pos)
Symbol * NewSymbol(std::initializer_list< Rule > rules={})
friend size_t hash_value(const Item &i)
std::string SplitByChildren(const LexerResult &tokens) const
const Rule * rule() const
bool operator==(const Item &other) const
Symbol * NextSymbol() const
Item Advance(size_t new_pos, const Item *child=nullptr) const
Item(const Rule *rule, size_t mark, size_t start, size_t pos)
const std::vector< Symbol * > & right() const
void CheckAmbiguity(const Item &other, const LexerResult &tokens) const
std::vector< const Item * > Children() const
MatchedInput GetMatchedInput(const LexerResult &tokens) const
bool(*)(InputPosition *pos) PatternFunction
Symbol * MatchToken(InputPosition *pos, InputPosition end)
std::map< PatternFunction, Symbol > patterns_
Symbol * Token(const std::string &keyword)
void SetWhitespace(PatternFunction whitespace)
V8_EXPORT_PRIVATE LexerResult RunLexer(const std::string &input)
PatternFunction match_whitespace_
Symbol * Pattern(PatternFunction pattern)
std::map< std::string, Symbol > keywords_
static V8_EXPORT_PRIVATE const TypeId id
ParseResultIterator(std::vector< ParseResult > results, MatchedInput matched_input)
const MatchedInput & matched_input() const
ParseResultIterator & operator=(const ParseResultIterator &)=delete
ParseResultIterator(const ParseResultIterator &)=delete
std::unique_ptr< ParseResultHolderBase > value_
void SetLeftHandSide(Symbol *left_hand_side)
Rule(std::vector< Symbol * > right_hand_side, Action action=DefaultAction)
std::vector< Symbol * > right_hand_side_
V8_EXPORT_PRIVATE std::optional< ParseResult > RunAction(const Item *completed_item, const LexerResult &tokens) const
const std::vector< Symbol * > & right() const
Symbol & operator=(const Symbol &)=delete
V8_EXPORT_PRIVATE std::optional< ParseResult > RunAction(const Item *item, const LexerResult &tokens)
Rule * rule(size_t index) const
Symbol(const Symbol &)=delete
std::vector< std::unique_ptr< Rule > > rules_
Symbol(std::initializer_list< Rule > rules)
void AddRule(const Rule &rule)
Register const value_
int start
int end
std::string pattern
ZoneVector< RpoNumber > & result
int x
STL namespace.
V8_INLINE size_t hash_combine(size_t seed, size_t hash)
Definition hashing.h:77
std::optional< ParseResult > ParseTokens(Symbol *start, const LexerResult &tokens)
std::optional< ParseResult > DefaultAction(ParseResultIterator *child_results)
const char * InputPosition
std::optional< ParseResult >(*)(ParseResultIterator *child_results) Action
const Item * RunEarleyAlgorithm(Symbol *start, const LexerResult &tokens, std::unordered_set< Item, base::hash< Item > > *processed)
Tagged< To > Cast(Tagged< From > value, const v8::SourceLocation &loc=INIT_SOURCE_LOCATION_IN_DEBUG)
Definition casting.h:150
#define DCHECK_LE(v1, v2)
Definition logging.h:490
#define DCHECK_NULL(val)
Definition logging.h:491
#define CHECK_LT(lhs, rhs)
#define DCHECK_NOT_NULL(val)
Definition logging.h:492
#define CHECK_EQ(lhs, rhs)
#define DCHECK(condition)
Definition logging.h:482
#define DCHECK_LT(v1, v2)
Definition logging.h:489
#define DCHECK_EQ(v1, v2)
Definition logging.h:485
#define USE(...)
Definition macros.h:293
#define V8_EXPORT_PRIVATE
Definition macros.h:460
std::vector< MatchedInput > token_contents
std::vector< Symbol * > token_symbols
MatchedInput(InputPosition begin, InputPosition end, SourcePosition pos)