• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2018 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_TORQUE_EARLEY_PARSER_H_
6 #define V8_TORQUE_EARLEY_PARSER_H_
7 
8 #include <map>
9 #include <memory>
10 #include <vector>
11 
12 #include "src/base/optional.h"
13 #include "src/torque/contextual.h"
14 #include "src/torque/source-positions.h"
15 #include "src/torque/utils.h"
16 
17 namespace v8 {
18 namespace internal {
19 namespace torque {
20 
21 class Symbol;
22 class Item;
23 
24 class ParseResultHolderBase {
25  public:
26   enum class TypeId;
27   virtual ~ParseResultHolderBase() = default;
28   template <class T>
29   T& Cast();
30   template <class T>
31   const T& Cast() const;
32 
33  protected:
ParseResultHolderBase(TypeId type_id)34   explicit ParseResultHolderBase(TypeId type_id) : type_id_(type_id) {
35     // MSVC wrongly complains about type_id_ being an unused private field.
36     USE(type_id_);
37   }
38 
39  private:
40   const TypeId type_id_;
41 };
42 
43 enum class ParseResultHolderBase::TypeId {
44   kStdString,
45   kBool,
46   kInt32,
47   kStdVectorOfString,
48   kExpressionPtr,
49   kIdentifierPtr,
50   kOptionalIdentifierPtr,
51   kStatementPtr,
52   kDeclarationPtr,
53   kTypeExpressionPtr,
54   kOptionalTypeExpressionPtr,
55   kTryHandlerPtr,
56   kNameAndTypeExpression,
57   kEnumEntry,
58   kStdVectorOfEnumEntry,
59   kImplicitParameters,
60   kOptionalImplicitParameters,
61   kNameAndExpression,
62   kAnnotation,
63   kVectorOfAnnotation,
64   kAnnotationParameter,
65   kOptionalAnnotationParameter,
66   kClassFieldExpression,
67   kStructFieldExpression,
68   kBitFieldDeclaration,
69   kStdVectorOfNameAndTypeExpression,
70   kStdVectorOfNameAndExpression,
71   kStdVectorOfClassFieldExpression,
72   kStdVectorOfStructFieldExpression,
73   kStdVectorOfBitFieldDeclaration,
74   kIncrementDecrementOperator,
75   kOptionalStdString,
76   kStdVectorOfStatementPtr,
77   kStdVectorOfDeclarationPtr,
78   kStdVectorOfStdVectorOfDeclarationPtr,
79   kStdVectorOfExpressionPtr,
80   kExpressionWithSource,
81   kParameterList,
82   kTypeList,
83   kOptionalTypeList,
84   kLabelAndTypes,
85   kStdVectorOfLabelAndTypes,
86   kStdVectorOfTryHandlerPtr,
87   kOptionalStatementPtr,
88   kOptionalExpressionPtr,
89   kTypeswitchCase,
90   kStdVectorOfTypeswitchCase,
91   kStdVectorOfIdentifierPtr,
92   kOptionalClassBody,
93   kGenericParameter,
94   kGenericParameters,
95 
96   kJsonValue,
97   kJsonMember,
98   kStdVectorOfJsonValue,
99   kStdVectorOfJsonMember,
100 };
101 
102 using ParseResultTypeId = ParseResultHolderBase::TypeId;
103 
104 template <class T>
105 class ParseResultHolder : public ParseResultHolderBase {
106  public:
ParseResultHolder(T value)107   explicit ParseResultHolder(T value)
108       : ParseResultHolderBase(id), value_(std::move(value)) {}
109 
110  private:
111   V8_EXPORT_PRIVATE static const TypeId id;
112   friend class ParseResultHolderBase;
113   T value_;
114 };
115 
116 template <class T>
Cast()117 T& ParseResultHolderBase::Cast() {
118   CHECK_EQ(ParseResultHolder<T>::id, type_id_);
119   return static_cast<ParseResultHolder<T>*>(this)->value_;
120 }
121 
122 template <class T>
Cast()123 const T& ParseResultHolderBase::Cast() const {
124   CHECK_EQ(ParseResultHolder<T>::id, type_id_);
125   return static_cast<const ParseResultHolder<T>*>(this)->value_;
126 }
127 
128 class ParseResult {
129  public:
130   template <class T>
ParseResult(T x)131   explicit ParseResult(T x) : value_(new ParseResultHolder<T>(std::move(x))) {}
132 
133   template <class T>
Cast()134   const T& Cast() const& {
135     return value_->Cast<T>();
136   }
137   template <class T>
Cast()138   T& Cast() & {
139     return value_->Cast<T>();
140   }
141   template <class T>
Cast()142   T&& Cast() && {
143     return std::move(value_->Cast<T>());
144   }
145 
146  private:
147   std::unique_ptr<ParseResultHolderBase> value_;
148 };
149 
150 using InputPosition = const char*;
151 
152 struct MatchedInput {
MatchedInputMatchedInput153   MatchedInput(InputPosition begin, InputPosition end, SourcePosition pos)
154       : begin(begin), end(end), pos(pos) {}
155   InputPosition begin;
156   InputPosition end;
157   SourcePosition pos;
ToStringMatchedInput158   std::string ToString() const { return {begin, end}; }
159 };
160 
161 class ParseResultIterator {
162  public:
ParseResultIterator(std::vector<ParseResult> results,MatchedInput matched_input)163   explicit ParseResultIterator(std::vector<ParseResult> results,
164                                MatchedInput matched_input)
165       : results_(std::move(results)), matched_input_(matched_input) {}
~ParseResultIterator()166   ~ParseResultIterator() {
167     // Check that all parse results have been used.
168     CHECK_EQ(results_.size(), i_);
169   }
170 
Next()171   ParseResult Next() {
172     CHECK_LT(i_, results_.size());
173     return std::move(results_[i_++]);
174   }
175   template <class T>
NextAs()176   T NextAs() {
177     return std::move(Next().Cast<T>());
178   }
HasNext()179   bool HasNext() const { return i_ < results_.size(); }
180 
matched_input()181   const MatchedInput& matched_input() const { return matched_input_; }
182 
183  private:
184   std::vector<ParseResult> results_;
185   size_t i_ = 0;
186   MatchedInput matched_input_;
187 
188   DISALLOW_COPY_AND_ASSIGN(ParseResultIterator);
189 };
190 
191 struct LexerResult {
192   std::vector<Symbol*> token_symbols;
193   std::vector<MatchedInput> token_contents;
194 };
195 
196 using Action =
197     base::Optional<ParseResult> (*)(ParseResultIterator* child_results);
198 
DefaultAction(ParseResultIterator * child_results)199 inline base::Optional<ParseResult> DefaultAction(
200     ParseResultIterator* child_results) {
201   if (!child_results->HasNext()) return base::nullopt;
202   return child_results->Next();
203 }
204 
205 template <class T, Action action>
AsSingletonVector()206 inline Action AsSingletonVector() {
207   return [](ParseResultIterator* child_results) -> base::Optional<ParseResult> {
208     auto result = action(child_results);
209     if (!result) return result;
210     return ParseResult{std::vector<T>{(*result).Cast<T>()}};
211   };
212 }
213 
214 // A rule of the context-free grammar. Each rule can have an action attached to
215 // it, which is executed after the parsing is finished.
216 class Rule final {
217  public:
218   explicit Rule(std::vector<Symbol*> right_hand_side,
219                 Action action = DefaultAction)
right_hand_side_(std::move (right_hand_side))220       : right_hand_side_(std::move(right_hand_side)), action_(action) {}
221 
left()222   Symbol* left() const {
223     DCHECK_NOT_NULL(left_hand_side_);
224     return left_hand_side_;
225   }
right()226   const std::vector<Symbol*>& right() const { return right_hand_side_; }
227 
SetLeftHandSide(Symbol * left_hand_side)228   void SetLeftHandSide(Symbol* left_hand_side) {
229     DCHECK_NULL(left_hand_side_);
230     left_hand_side_ = left_hand_side;
231   }
232 
233   V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
234       const Item* completed_item, const LexerResult& tokens) const;
235 
236  private:
237   Symbol* left_hand_side_ = nullptr;
238   std::vector<Symbol*> right_hand_side_;
239   Action action_;
240 };
241 
242 // A Symbol represents a terminal or a non-terminal of the grammar.
243 // It stores the list of rules, which have this symbol as the
244 // left-hand side.
245 // Terminals have an empty list of rules, they are created by the Lexer
246 // instead of from rules.
247 // Symbols need to reside at stable memory addresses, because the addresses are
248 // used in the parser.
249 class Symbol {
250  public:
Symbol()251   Symbol() : Symbol({}) {}
Symbol(std::initializer_list<Rule> rules)252   Symbol(std::initializer_list<Rule> rules) { *this = rules; }
253 
254   V8_EXPORT_PRIVATE Symbol& operator=(std::initializer_list<Rule> rules);
255 
IsTerminal()256   bool IsTerminal() const { return rules_.empty(); }
rule(size_t index)257   Rule* rule(size_t index) const { return rules_[index].get(); }
rule_number()258   size_t rule_number() const { return rules_.size(); }
259 
AddRule(const Rule & rule)260   void AddRule(const Rule& rule) {
261     rules_.push_back(std::make_unique<Rule>(rule));
262     rules_.back()->SetLeftHandSide(this);
263   }
264 
265   V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
266       const Item* item, const LexerResult& tokens);
267 
268  private:
269   std::vector<std::unique_ptr<Rule>> rules_;
270 
271   // Disallow copying and moving to ensure Symbol has a stable address.
272   DISALLOW_COPY_AND_ASSIGN(Symbol);
273 };
274 
275 // Items are the core datastructure of Earley's algorithm.
276 // They consist of a (partially) matched rule, a marked position inside of the
277 // right-hand side of the rule (traditionally written as a dot) and an input
278 // range from {start} to {pos} that matches the symbols of the right-hand side
279 // that are left of the mark. In addition, they store a child and a left-sibling
280 // pointer to reconstruct the AST in the end.
281 class Item {
282  public:
Item(const Rule * rule,size_t mark,size_t start,size_t pos)283   Item(const Rule* rule, size_t mark, size_t start, size_t pos)
284       : rule_(rule), mark_(mark), start_(start), pos_(pos) {
285     DCHECK_LE(mark_, right().size());
286   }
287 
288   // A complete item has the mark at the right end, which means the input range
289   // matches the complete rule.
IsComplete()290   bool IsComplete() const {
291     DCHECK_LE(mark_, right().size());
292     return mark_ == right().size();
293   }
294 
295   // The symbol right after the mark is expected at {pos} for this item to
296   // advance.
NextSymbol()297   Symbol* NextSymbol() const {
298     DCHECK(!IsComplete());
299     DCHECK_LT(mark_, right().size());
300     return right()[mark_];
301   }
302 
303   // We successfully parsed NextSymbol() between {pos} and {new_pos}.
304   // If NextSymbol() was a non-terminal, then {child} is a pointer to a
305   // completed item for this parse.
306   // We create a new item, which moves the mark one forward.
307   Item Advance(size_t new_pos, const Item* child = nullptr) const {
308     if (child) {
309       DCHECK(child->IsComplete());
310       DCHECK_EQ(pos(), child->start());
311       DCHECK_EQ(new_pos, child->pos());
312       DCHECK_EQ(NextSymbol(), child->left());
313     }
314     Item result(rule_, mark_ + 1, start_, new_pos);
315     result.prev_ = this;
316     result.child_ = child;
317     return result;
318   }
319 
320   // Collect the items representing the AST children of this completed item.
321   std::vector<const Item*> Children() const;
322   // The matched input separated according to the next branching AST level.
323   std::string SplitByChildren(const LexerResult& tokens) const;
324   // Check if {other} results in the same AST as this Item.
325   void CheckAmbiguity(const Item& other, const LexerResult& tokens) const;
326 
GetMatchedInput(const LexerResult & tokens)327   MatchedInput GetMatchedInput(const LexerResult& tokens) const {
328     const MatchedInput& start = tokens.token_contents[start_];
329     const MatchedInput& end = start_ == pos_ ? tokens.token_contents[start_]
330                                              : tokens.token_contents[pos_ - 1];
331     CHECK_EQ(start.pos.source, end.pos.source);
332     SourcePosition combined{start.pos.source, start.pos.start, end.pos.end};
333 
334     return {start.begin, end.end, combined};
335   }
336 
337   // We exclude {prev_} and {child_} from equality and hash computations,
338   // because they are just globally unique data associated with an item.
339   bool operator==(const Item& other) const {
340     return rule_ == other.rule_ && mark_ == other.mark_ &&
341            start_ == other.start_ && pos_ == other.pos_;
342   }
343 
hash_value(const Item & i)344   friend size_t hash_value(const Item& i) {
345     return base::hash_combine(i.rule_, i.mark_, i.start_, i.pos_);
346   }
347 
rule()348   const Rule* rule() const { return rule_; }
left()349   Symbol* left() const { return rule_->left(); }
right()350   const std::vector<Symbol*>& right() const { return rule_->right(); }
pos()351   size_t pos() const { return pos_; }
start()352   size_t start() const { return start_; }
353 
354  private:
355   const Rule* rule_;
356   size_t mark_;
357   size_t start_;
358   size_t pos_;
359 
360   const Item* prev_ = nullptr;
361   const Item* child_ = nullptr;
362 };
363 
RunAction(const Item * item,const LexerResult & tokens)364 inline base::Optional<ParseResult> Symbol::RunAction(
365     const Item* item, const LexerResult& tokens) {
366   DCHECK(item->IsComplete());
367   DCHECK_EQ(item->left(), this);
368   return item->rule()->RunAction(item, tokens);
369 }
370 
371 V8_EXPORT_PRIVATE const Item* RunEarleyAlgorithm(
372     Symbol* start, const LexerResult& tokens,
373     std::unordered_set<Item, base::hash<Item>>* processed);
374 
ParseTokens(Symbol * start,const LexerResult & tokens)375 inline base::Optional<ParseResult> ParseTokens(Symbol* start,
376                                                const LexerResult& tokens) {
377   std::unordered_set<Item, base::hash<Item>> table;
378   const Item* final_item = RunEarleyAlgorithm(start, tokens, &table);
379   return start->RunAction(final_item, tokens);
380 }
381 
382 // The lexical syntax is dynamically defined while building the grammar by
383 // adding patterns and keywords to the Lexer.
384 // The term keyword here can stand for any fixed character sequence, including
385 // operators and parentheses.
386 // Each pattern or keyword automatically gets a terminal symbol associated with
387 // it. These symbols form the result of the lexing.
388 // Patterns and keywords are matched using the longest match principle. If the
389 // longest matching pattern coincides with a keyword, the keyword symbol is
390 // chosen instead of the pattern.
391 // In addition, there is a single whitespace pattern which is consumed but does
392 // not become part of the token list.
393 class Lexer {
394  public:
395   // Functions to define patterns. They try to match starting from {pos}. If
396   // successful, they return true and advance {pos}. Otherwise, {pos} stays
397   // unchanged.
398   using PatternFunction = bool (*)(InputPosition* pos);
399 
SetWhitespace(PatternFunction whitespace)400   void SetWhitespace(PatternFunction whitespace) {
401     match_whitespace_ = whitespace;
402   }
403 
Pattern(PatternFunction pattern)404   Symbol* Pattern(PatternFunction pattern) { return &patterns_[pattern]; }
Token(const std::string & keyword)405   Symbol* Token(const std::string& keyword) { return &keywords_[keyword]; }
406   V8_EXPORT_PRIVATE LexerResult RunLexer(const std::string& input);
407 
408  private:
409   PatternFunction match_whitespace_ = [](InputPosition*) { return false; };
410   std::map<PatternFunction, Symbol> patterns_;
411   std::map<std::string, Symbol> keywords_;
412   Symbol* MatchToken(InputPosition* pos, InputPosition end);
413 };
414 
415 // A grammar can have a result, which is the results of the start symbol.
416 // Grammar is intended to be subclassed, with Symbol members forming the
417 // mutually recursive rules of the grammar.
418 class Grammar {
419  public:
420   using PatternFunction = Lexer::PatternFunction;
421 
Grammar(Symbol * start)422   explicit Grammar(Symbol* start) : start_(start) {}
423 
Parse(const std::string & input)424   base::Optional<ParseResult> Parse(const std::string& input) {
425     LexerResult tokens = lexer().RunLexer(input);
426     return ParseTokens(start_, tokens);
427   }
428 
429  protected:
Token(const std::string & s)430   Symbol* Token(const std::string& s) { return lexer_.Token(s); }
Pattern(PatternFunction pattern)431   Symbol* Pattern(PatternFunction pattern) { return lexer_.Pattern(pattern); }
SetWhitespace(PatternFunction ws)432   void SetWhitespace(PatternFunction ws) { lexer_.SetWhitespace(ws); }
433 
434   // NewSymbol() allocates a fresh symbol and stores it in the current grammar.
435   // This is necessary to define helpers that create new symbols.
436   Symbol* NewSymbol(std::initializer_list<Rule> rules = {}) {
437     auto symbol = std::make_unique<Symbol>(rules);
438     Symbol* result = symbol.get();
439     generated_symbols_.push_back(std::move(symbol));
440     return result;
441   }
442 
443   // Helper functions to define lexer patterns. If they match, they return true
444   // and advance {pos}. Otherwise, {pos} is unchanged.
445   V8_EXPORT_PRIVATE static bool MatchChar(int (*char_class)(int),
446                                           InputPosition* pos);
447   V8_EXPORT_PRIVATE static bool MatchChar(bool (*char_class)(char),
448                                           InputPosition* pos);
449   V8_EXPORT_PRIVATE static bool MatchAnyChar(InputPosition* pos);
450   V8_EXPORT_PRIVATE static bool MatchString(const char* s, InputPosition* pos);
451 
452   // The action MatchInput() produces the input matched by the rule as
453   // result.
YieldMatchedInput(ParseResultIterator * child_results)454   static base::Optional<ParseResult> YieldMatchedInput(
455       ParseResultIterator* child_results) {
456     return ParseResult{child_results->matched_input().ToString()};
457   }
458 
459   // Create a new symbol to parse the given sequence of symbols.
460   // At most one of the symbols can return a result.
Sequence(std::vector<Symbol * > symbols)461   Symbol* Sequence(std::vector<Symbol*> symbols) {
462     return NewSymbol({Rule(std::move(symbols))});
463   }
464 
465   template <class T, T value>
YieldIntegralConstant(ParseResultIterator * child_results)466   static base::Optional<ParseResult> YieldIntegralConstant(
467       ParseResultIterator* child_results) {
468     return ParseResult{value};
469   }
470 
471   template <class T>
YieldDefaultValue(ParseResultIterator * child_results)472   static base::Optional<ParseResult> YieldDefaultValue(
473       ParseResultIterator* child_results) {
474     return ParseResult{T{}};
475   }
476 
477   template <class From, class To>
CastParseResult(ParseResultIterator * child_results)478   static base::Optional<ParseResult> CastParseResult(
479       ParseResultIterator* child_results) {
480     To result = std::move(child_results->NextAs<From>());
481     return ParseResult{std::move(result)};
482   }
483 
484   // Try to parse {s} and return the result of type {Result} casted to {T}.
485   // Otherwise, the result is a default-constructed {T}.
486   template <class T, class Result = T>
TryOrDefault(Symbol * s)487   Symbol* TryOrDefault(Symbol* s) {
488     return NewSymbol({Rule({s}, CastParseResult<Result, T>),
489                       Rule({}, YieldDefaultValue<T>)});
490   }
491 
492   template <class T>
MakeSingletonVector(ParseResultIterator * child_results)493   static base::Optional<ParseResult> MakeSingletonVector(
494       ParseResultIterator* child_results) {
495     T x = child_results->NextAs<T>();
496     std::vector<T> result;
497     result.push_back(std::move(x));
498     return ParseResult{std::move(result)};
499   }
500 
501   template <class T>
MakeExtendedVector(ParseResultIterator * child_results)502   static base::Optional<ParseResult> MakeExtendedVector(
503       ParseResultIterator* child_results) {
504     std::vector<T> l = child_results->NextAs<std::vector<T>>();
505     T x = child_results->NextAs<T>();
506     l.push_back(std::move(x));
507     return ParseResult{std::move(l)};
508   }
509 
510   // For example, NonemptyList(Token("A"), Token(",")) parses any of
511   // A or A,A or A,A,A and so on.
512   template <class T>
513   Symbol* NonemptyList(Symbol* element,
514                        base::Optional<Symbol*> separator = {}) {
515     Symbol* list = NewSymbol();
516     *list = {Rule({element}, MakeSingletonVector<T>),
517              separator
518                  ? Rule({list, *separator, element}, MakeExtendedVector<T>)
519                  : Rule({list, element}, MakeExtendedVector<T>)};
520     return list;
521   }
522 
523   template <class T>
524   Symbol* List(Symbol* element, base::Optional<Symbol*> separator = {}) {
525     return TryOrDefault<std::vector<T>>(NonemptyList<T>(element, separator));
526   }
527 
528   template <class T>
Optional(Symbol * x)529   Symbol* Optional(Symbol* x) {
530     return TryOrDefault<base::Optional<T>, T>(x);
531   }
532 
CheckIf(Symbol * x)533   Symbol* CheckIf(Symbol* x) {
534     return NewSymbol({Rule({x}, YieldIntegralConstant<bool, true>),
535                       Rule({}, YieldIntegralConstant<bool, false>)});
536   }
537 
lexer()538   Lexer& lexer() { return lexer_; }
539 
540  private:
541   Lexer lexer_;
542   std::vector<std::unique_ptr<Symbol>> generated_symbols_;
543   Symbol* start_;
544 };
545 
546 }  // namespace torque
547 }  // namespace internal
548 }  // namespace v8
549 
550 #endif  // V8_TORQUE_EARLEY_PARSER_H_
551