1 // Copyright 2018 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef V8_TORQUE_EARLEY_PARSER_H_
6 #define V8_TORQUE_EARLEY_PARSER_H_
7
8 #include <map>
9 #include <memory>
10 #include <vector>
11
12 #include "src/base/optional.h"
13 #include "src/torque/contextual.h"
14 #include "src/torque/source-positions.h"
15 #include "src/torque/utils.h"
16
17 namespace v8 {
18 namespace internal {
19 namespace torque {
20
21 class Symbol;
22 class Item;
23
24 class ParseResultHolderBase {
25 public:
26 enum class TypeId;
27 virtual ~ParseResultHolderBase() = default;
28 template <class T>
29 T& Cast();
30 template <class T>
31 const T& Cast() const;
32
33 protected:
ParseResultHolderBase(TypeId type_id)34 explicit ParseResultHolderBase(TypeId type_id) : type_id_(type_id) {
35 // MSVC wrongly complains about type_id_ being an unused private field.
36 USE(type_id_);
37 }
38
39 private:
40 const TypeId type_id_;
41 };
42
43 enum class ParseResultHolderBase::TypeId {
44 kStdString,
45 kBool,
46 kInt32,
47 kDouble,
48 kIntegerLiteral,
49 kStdVectorOfString,
50 kExpressionPtr,
51 kIdentifierPtr,
52 kOptionalIdentifierPtr,
53 kStatementPtr,
54 kDeclarationPtr,
55 kTypeExpressionPtr,
56 kOptionalTypeExpressionPtr,
57 kTryHandlerPtr,
58 kNameAndTypeExpression,
59 kEnumEntry,
60 kStdVectorOfEnumEntry,
61 kImplicitParameters,
62 kOptionalImplicitParameters,
63 kNameAndExpression,
64 kAnnotation,
65 kVectorOfAnnotation,
66 kAnnotationParameter,
67 kOptionalAnnotationParameter,
68 kClassFieldExpression,
69 kStructFieldExpression,
70 kBitFieldDeclaration,
71 kStdVectorOfNameAndTypeExpression,
72 kStdVectorOfNameAndExpression,
73 kStdVectorOfClassFieldExpression,
74 kStdVectorOfStructFieldExpression,
75 kStdVectorOfBitFieldDeclaration,
76 kIncrementDecrementOperator,
77 kOptionalStdString,
78 kStdVectorOfStatementPtr,
79 kStdVectorOfDeclarationPtr,
80 kStdVectorOfStdVectorOfDeclarationPtr,
81 kStdVectorOfExpressionPtr,
82 kExpressionWithSource,
83 kParameterList,
84 kTypeList,
85 kOptionalTypeList,
86 kLabelAndTypes,
87 kStdVectorOfLabelAndTypes,
88 kStdVectorOfTryHandlerPtr,
89 kOptionalStatementPtr,
90 kOptionalExpressionPtr,
91 kTypeswitchCase,
92 kStdVectorOfTypeswitchCase,
93 kStdVectorOfIdentifierPtr,
94 kOptionalClassBody,
95 kGenericParameter,
96 kGenericParameters,
97
98 kJsonValue,
99 kJsonMember,
100 kStdVectorOfJsonValue,
101 kStdVectorOfJsonMember,
102 };
103
104 using ParseResultTypeId = ParseResultHolderBase::TypeId;
105
106 template <class T>
107 class ParseResultHolder : public ParseResultHolderBase {
108 public:
ParseResultHolder(T value)109 explicit ParseResultHolder(T value)
110 : ParseResultHolderBase(id), value_(std::move(value)) {}
111
112 private:
113 V8_EXPORT_PRIVATE static const TypeId id;
114 friend class ParseResultHolderBase;
115 T value_;
116 };
117
118 template <class T>
Cast()119 T& ParseResultHolderBase::Cast() {
120 CHECK_EQ(ParseResultHolder<T>::id, type_id_);
121 return static_cast<ParseResultHolder<T>*>(this)->value_;
122 }
123
124 template <class T>
Cast()125 const T& ParseResultHolderBase::Cast() const {
126 CHECK_EQ(ParseResultHolder<T>::id, type_id_);
127 return static_cast<const ParseResultHolder<T>*>(this)->value_;
128 }
129
130 class ParseResult {
131 public:
132 template <class T>
ParseResult(T x)133 explicit ParseResult(T x) : value_(new ParseResultHolder<T>(std::move(x))) {}
134
135 template <class T>
Cast()136 const T& Cast() const& {
137 return value_->Cast<T>();
138 }
139 template <class T>
Cast()140 T& Cast() & {
141 return value_->Cast<T>();
142 }
143 template <class T>
Cast()144 T&& Cast() && {
145 return std::move(value_->Cast<T>());
146 }
147
148 private:
149 std::unique_ptr<ParseResultHolderBase> value_;
150 };
151
152 using InputPosition = const char*;
153
154 struct MatchedInput {
MatchedInputMatchedInput155 MatchedInput(InputPosition begin, InputPosition end, SourcePosition pos)
156 : begin(begin), end(end), pos(pos) {}
157 InputPosition begin;
158 InputPosition end;
159 SourcePosition pos;
ToStringMatchedInput160 std::string ToString() const { return {begin, end}; }
161 };
162
163 class ParseResultIterator {
164 public:
ParseResultIterator(std::vector<ParseResult> results,MatchedInput matched_input)165 explicit ParseResultIterator(std::vector<ParseResult> results,
166 MatchedInput matched_input)
167 : results_(std::move(results)), matched_input_(matched_input) {}
168
169 ParseResultIterator(const ParseResultIterator&) = delete;
170 ParseResultIterator& operator=(const ParseResultIterator&) = delete;
171
Next()172 ParseResult Next() {
173 CHECK_LT(i_, results_.size());
174 return std::move(results_[i_++]);
175 }
176 template <class T>
NextAs()177 T NextAs() {
178 return std::move(Next().Cast<T>());
179 }
HasNext()180 bool HasNext() const { return i_ < results_.size(); }
181
matched_input()182 const MatchedInput& matched_input() const { return matched_input_; }
183
184 private:
185 std::vector<ParseResult> results_;
186 size_t i_ = 0;
187 MatchedInput matched_input_;
188 };
189
190 struct LexerResult {
191 std::vector<Symbol*> token_symbols;
192 std::vector<MatchedInput> token_contents;
193 };
194
195 using Action =
196 base::Optional<ParseResult> (*)(ParseResultIterator* child_results);
197
DefaultAction(ParseResultIterator * child_results)198 inline base::Optional<ParseResult> DefaultAction(
199 ParseResultIterator* child_results) {
200 if (!child_results->HasNext()) return base::nullopt;
201 return child_results->Next();
202 }
203
204 template <class T, Action action>
AsSingletonVector()205 inline Action AsSingletonVector() {
206 return [](ParseResultIterator* child_results) -> base::Optional<ParseResult> {
207 auto result = action(child_results);
208 if (!result) return result;
209 return ParseResult{std::vector<T>{(*result).Cast<T>()}};
210 };
211 }
212
213 // A rule of the context-free grammar. Each rule can have an action attached to
214 // it, which is executed after the parsing is finished.
215 class Rule final {
216 public:
217 explicit Rule(std::vector<Symbol*> right_hand_side,
218 Action action = DefaultAction)
right_hand_side_(std::move (right_hand_side))219 : right_hand_side_(std::move(right_hand_side)), action_(action) {}
220
left()221 Symbol* left() const {
222 DCHECK_NOT_NULL(left_hand_side_);
223 return left_hand_side_;
224 }
right()225 const std::vector<Symbol*>& right() const { return right_hand_side_; }
226
SetLeftHandSide(Symbol * left_hand_side)227 void SetLeftHandSide(Symbol* left_hand_side) {
228 DCHECK_NULL(left_hand_side_);
229 left_hand_side_ = left_hand_side;
230 }
231
232 V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
233 const Item* completed_item, const LexerResult& tokens) const;
234
235 private:
236 Symbol* left_hand_side_ = nullptr;
237 std::vector<Symbol*> right_hand_side_;
238 Action action_;
239 };
240
241 // A Symbol represents a terminal or a non-terminal of the grammar.
242 // It stores the list of rules, which have this symbol as the
243 // left-hand side.
244 // Terminals have an empty list of rules, they are created by the Lexer
245 // instead of from rules.
246 // Symbols need to reside at stable memory addresses, because the addresses are
247 // used in the parser.
248 class Symbol {
249 public:
250 Symbol() = default;
Symbol(std::initializer_list<Rule> rules)251 Symbol(std::initializer_list<Rule> rules) { *this = rules; }
252
253 // Disallow copying and moving to ensure Symbol has a stable address.
254 Symbol(const Symbol&) = delete;
255 Symbol& operator=(const Symbol&) = delete;
256
257 V8_EXPORT_PRIVATE Symbol& operator=(std::initializer_list<Rule> rules);
258
IsTerminal()259 bool IsTerminal() const { return rules_.empty(); }
rule(size_t index)260 Rule* rule(size_t index) const { return rules_[index].get(); }
rule_number()261 size_t rule_number() const { return rules_.size(); }
262
AddRule(const Rule & rule)263 void AddRule(const Rule& rule) {
264 rules_.push_back(std::make_unique<Rule>(rule));
265 rules_.back()->SetLeftHandSide(this);
266 }
267
268 V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction(
269 const Item* item, const LexerResult& tokens);
270
271 private:
272 std::vector<std::unique_ptr<Rule>> rules_;
273 };
274
275 // Items are the core datastructure of Earley's algorithm.
276 // They consist of a (partially) matched rule, a marked position inside of the
277 // right-hand side of the rule (traditionally written as a dot) and an input
278 // range from {start} to {pos} that matches the symbols of the right-hand side
279 // that are left of the mark. In addition, they store a child and a left-sibling
280 // pointer to reconstruct the AST in the end.
281 class Item {
282 public:
Item(const Rule * rule,size_t mark,size_t start,size_t pos)283 Item(const Rule* rule, size_t mark, size_t start, size_t pos)
284 : rule_(rule), mark_(mark), start_(start), pos_(pos) {
285 DCHECK_LE(mark_, right().size());
286 }
287
288 // A complete item has the mark at the right end, which means the input range
289 // matches the complete rule.
IsComplete()290 bool IsComplete() const {
291 DCHECK_LE(mark_, right().size());
292 return mark_ == right().size();
293 }
294
295 // The symbol right after the mark is expected at {pos} for this item to
296 // advance.
NextSymbol()297 Symbol* NextSymbol() const {
298 DCHECK(!IsComplete());
299 DCHECK_LT(mark_, right().size());
300 return right()[mark_];
301 }
302
303 // We successfully parsed NextSymbol() between {pos} and {new_pos}.
304 // If NextSymbol() was a non-terminal, then {child} is a pointer to a
305 // completed item for this parse.
306 // We create a new item, which moves the mark one forward.
307 Item Advance(size_t new_pos, const Item* child = nullptr) const {
308 if (child) {
309 DCHECK(child->IsComplete());
310 DCHECK_EQ(pos(), child->start());
311 DCHECK_EQ(new_pos, child->pos());
312 DCHECK_EQ(NextSymbol(), child->left());
313 }
314 Item result(rule_, mark_ + 1, start_, new_pos);
315 result.prev_ = this;
316 result.child_ = child;
317 return result;
318 }
319
320 // Collect the items representing the AST children of this completed item.
321 std::vector<const Item*> Children() const;
322 // The matched input separated according to the next branching AST level.
323 std::string SplitByChildren(const LexerResult& tokens) const;
324 // Check if {other} results in the same AST as this Item.
325 void CheckAmbiguity(const Item& other, const LexerResult& tokens) const;
326
GetMatchedInput(const LexerResult & tokens)327 MatchedInput GetMatchedInput(const LexerResult& tokens) const {
328 const MatchedInput& start = tokens.token_contents[start_];
329 const MatchedInput& end = start_ == pos_ ? tokens.token_contents[start_]
330 : tokens.token_contents[pos_ - 1];
331 CHECK_EQ(start.pos.source, end.pos.source);
332 SourcePosition combined{start.pos.source, start.pos.start, end.pos.end};
333
334 return {start.begin, end.end, combined};
335 }
336
337 // We exclude {prev_} and {child_} from equality and hash computations,
338 // because they are just globally unique data associated with an item.
339 bool operator==(const Item& other) const {
340 return rule_ == other.rule_ && mark_ == other.mark_ &&
341 start_ == other.start_ && pos_ == other.pos_;
342 }
343
hash_value(const Item & i)344 friend size_t hash_value(const Item& i) {
345 return base::hash_combine(i.rule_, i.mark_, i.start_, i.pos_);
346 }
347
rule()348 const Rule* rule() const { return rule_; }
left()349 Symbol* left() const { return rule_->left(); }
right()350 const std::vector<Symbol*>& right() const { return rule_->right(); }
pos()351 size_t pos() const { return pos_; }
start()352 size_t start() const { return start_; }
353
354 private:
355 const Rule* rule_;
356 size_t mark_;
357 size_t start_;
358 size_t pos_;
359
360 const Item* prev_ = nullptr;
361 const Item* child_ = nullptr;
362 };
363
RunAction(const Item * item,const LexerResult & tokens)364 inline base::Optional<ParseResult> Symbol::RunAction(
365 const Item* item, const LexerResult& tokens) {
366 DCHECK(item->IsComplete());
367 DCHECK_EQ(item->left(), this);
368 return item->rule()->RunAction(item, tokens);
369 }
370
371 V8_EXPORT_PRIVATE const Item* RunEarleyAlgorithm(
372 Symbol* start, const LexerResult& tokens,
373 std::unordered_set<Item, base::hash<Item>>* processed);
374
ParseTokens(Symbol * start,const LexerResult & tokens)375 inline base::Optional<ParseResult> ParseTokens(Symbol* start,
376 const LexerResult& tokens) {
377 std::unordered_set<Item, base::hash<Item>> table;
378 const Item* final_item = RunEarleyAlgorithm(start, tokens, &table);
379 return start->RunAction(final_item, tokens);
380 }
381
382 // The lexical syntax is dynamically defined while building the grammar by
383 // adding patterns and keywords to the Lexer.
384 // The term keyword here can stand for any fixed character sequence, including
385 // operators and parentheses.
386 // Each pattern or keyword automatically gets a terminal symbol associated with
387 // it. These symbols form the result of the lexing.
388 // Patterns and keywords are matched using the longest match principle. If the
389 // longest matching pattern coincides with a keyword, the keyword symbol is
390 // chosen instead of the pattern.
391 // In addition, there is a single whitespace pattern which is consumed but does
392 // not become part of the token list.
393 class Lexer {
394 public:
395 // Functions to define patterns. They try to match starting from {pos}. If
396 // successful, they return true and advance {pos}. Otherwise, {pos} stays
397 // unchanged.
398 using PatternFunction = bool (*)(InputPosition* pos);
399
SetWhitespace(PatternFunction whitespace)400 void SetWhitespace(PatternFunction whitespace) {
401 match_whitespace_ = whitespace;
402 }
403
Pattern(PatternFunction pattern)404 Symbol* Pattern(PatternFunction pattern) { return &patterns_[pattern]; }
Token(const std::string & keyword)405 Symbol* Token(const std::string& keyword) { return &keywords_[keyword]; }
406 V8_EXPORT_PRIVATE LexerResult RunLexer(const std::string& input);
407
408 private:
409 PatternFunction match_whitespace_ = [](InputPosition*) { return false; };
410 std::map<PatternFunction, Symbol> patterns_;
411 std::map<std::string, Symbol> keywords_;
412 Symbol* MatchToken(InputPosition* pos, InputPosition end);
413 };
414
415 // A grammar can have a result, which is the results of the start symbol.
416 // Grammar is intended to be subclassed, with Symbol members forming the
417 // mutually recursive rules of the grammar.
418 class Grammar {
419 public:
420 using PatternFunction = Lexer::PatternFunction;
421
Grammar(Symbol * start)422 explicit Grammar(Symbol* start) : start_(start) {}
423
Parse(const std::string & input)424 base::Optional<ParseResult> Parse(const std::string& input) {
425 LexerResult tokens = lexer().RunLexer(input);
426 return ParseTokens(start_, tokens);
427 }
428
429 protected:
Token(const std::string & s)430 Symbol* Token(const std::string& s) { return lexer_.Token(s); }
Pattern(PatternFunction pattern)431 Symbol* Pattern(PatternFunction pattern) { return lexer_.Pattern(pattern); }
SetWhitespace(PatternFunction ws)432 void SetWhitespace(PatternFunction ws) { lexer_.SetWhitespace(ws); }
433
434 // NewSymbol() allocates a fresh symbol and stores it in the current grammar.
435 // This is necessary to define helpers that create new symbols.
436 Symbol* NewSymbol(std::initializer_list<Rule> rules = {}) {
437 auto symbol = std::make_unique<Symbol>(rules);
438 Symbol* result = symbol.get();
439 generated_symbols_.push_back(std::move(symbol));
440 return result;
441 }
442
443 // Helper functions to define lexer patterns. If they match, they return true
444 // and advance {pos}. Otherwise, {pos} is unchanged.
445 V8_EXPORT_PRIVATE static bool MatchChar(int (*char_class)(int),
446 InputPosition* pos);
447 V8_EXPORT_PRIVATE static bool MatchChar(bool (*char_class)(char),
448 InputPosition* pos);
449 V8_EXPORT_PRIVATE static bool MatchAnyChar(InputPosition* pos);
450 V8_EXPORT_PRIVATE static bool MatchString(const char* s, InputPosition* pos);
451
452 // The action MatchInput() produces the input matched by the rule as
453 // result.
YieldMatchedInput(ParseResultIterator * child_results)454 static base::Optional<ParseResult> YieldMatchedInput(
455 ParseResultIterator* child_results) {
456 return ParseResult{child_results->matched_input().ToString()};
457 }
458
459 // Create a new symbol to parse the given sequence of symbols.
460 // At most one of the symbols can return a result.
Sequence(std::vector<Symbol * > symbols)461 Symbol* Sequence(std::vector<Symbol*> symbols) {
462 return NewSymbol({Rule(std::move(symbols))});
463 }
464
465 template <class T, T value>
YieldIntegralConstant(ParseResultIterator * child_results)466 static base::Optional<ParseResult> YieldIntegralConstant(
467 ParseResultIterator* child_results) {
468 return ParseResult{value};
469 }
470
471 template <class T>
YieldDefaultValue(ParseResultIterator * child_results)472 static base::Optional<ParseResult> YieldDefaultValue(
473 ParseResultIterator* child_results) {
474 return ParseResult{T{}};
475 }
476
477 template <class From, class To>
CastParseResult(ParseResultIterator * child_results)478 static base::Optional<ParseResult> CastParseResult(
479 ParseResultIterator* child_results) {
480 To result = std::move(child_results->NextAs<From>());
481 return ParseResult{std::move(result)};
482 }
483
484 // Try to parse {s} and return the result of type {Result} casted to {T}.
485 // Otherwise, the result is a default-constructed {T}.
486 template <class T, class Result = T>
TryOrDefault(Symbol * s)487 Symbol* TryOrDefault(Symbol* s) {
488 return NewSymbol({Rule({s}, CastParseResult<Result, T>),
489 Rule({}, YieldDefaultValue<T>)});
490 }
491
492 template <class T>
MakeSingletonVector(ParseResultIterator * child_results)493 static base::Optional<ParseResult> MakeSingletonVector(
494 ParseResultIterator* child_results) {
495 T x = child_results->NextAs<T>();
496 std::vector<T> result;
497 result.push_back(std::move(x));
498 return ParseResult{std::move(result)};
499 }
500
501 template <class T>
MakeExtendedVector(ParseResultIterator * child_results)502 static base::Optional<ParseResult> MakeExtendedVector(
503 ParseResultIterator* child_results) {
504 std::vector<T> l = child_results->NextAs<std::vector<T>>();
505 T x = child_results->NextAs<T>();
506 l.push_back(std::move(x));
507 return ParseResult{std::move(l)};
508 }
509
510 // For example, NonemptyList(Token("A"), Token(",")) parses any of
511 // A or A,A or A,A,A and so on.
512 template <class T>
513 Symbol* NonemptyList(Symbol* element,
514 base::Optional<Symbol*> separator = {}) {
515 Symbol* list = NewSymbol();
516 *list = {Rule({element}, MakeSingletonVector<T>),
517 separator
518 ? Rule({list, *separator, element}, MakeExtendedVector<T>)
519 : Rule({list, element}, MakeExtendedVector<T>)};
520 return list;
521 }
522
523 template <class T>
524 Symbol* List(Symbol* element, base::Optional<Symbol*> separator = {}) {
525 return TryOrDefault<std::vector<T>>(NonemptyList<T>(element, separator));
526 }
527
528 template <class T>
Optional(Symbol * x)529 Symbol* Optional(Symbol* x) {
530 return TryOrDefault<base::Optional<T>, T>(x);
531 }
532
CheckIf(Symbol * x)533 Symbol* CheckIf(Symbol* x) {
534 return NewSymbol({Rule({x}, YieldIntegralConstant<bool, true>),
535 Rule({}, YieldIntegralConstant<bool, false>)});
536 }
537
lexer()538 Lexer& lexer() { return lexer_; }
539
540 private:
541 Lexer lexer_;
542 std::vector<std::unique_ptr<Symbol>> generated_symbols_;
543 Symbol* start_;
544 };
545
546 } // namespace torque
547 } // namespace internal
548 } // namespace v8
549
550 #endif // V8_TORQUE_EARLEY_PARSER_H_
551