• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2018 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include <algorithm>
6 #include <set>
7 #include <unordered_map>
8 #include <unordered_set>
9 
10 #include "src/torque/ast.h"
11 #include "src/torque/earley-parser.h"
12 #include "src/torque/utils.h"
13 
14 namespace v8 {
15 namespace internal {
16 namespace torque {
17 
18 namespace {
19 
20 struct LineAndColumnTracker {
21   LineAndColumn previous{0, 0, 0};
22   LineAndColumn current{0, 0, 0};
23 
Advancev8::internal::torque::__anon9ab08f860111::LineAndColumnTracker24   void Advance(InputPosition from, InputPosition to) {
25     previous = current;
26     current.offset += std::distance(from, to);
27     while (from != to) {
28       if (*from == '\n') {
29         current.line += 1;
30         current.column = 0;
31       } else {
32         current.column += 1;
33       }
34       ++from;
35     }
36   }
37 
ToSourcePositionv8::internal::torque::__anon9ab08f860111::LineAndColumnTracker38   SourcePosition ToSourcePosition() {
39     return {CurrentSourceFile::Get(), previous, current};
40   }
41 };
42 
43 }  // namespace
44 
RunAction(const Item * completed_item,const LexerResult & tokens) const45 base::Optional<ParseResult> Rule::RunAction(const Item* completed_item,
46                                             const LexerResult& tokens) const {
47   std::vector<ParseResult> results;
48   for (const Item* child : completed_item->Children()) {
49     if (!child) continue;
50     base::Optional<ParseResult> child_result =
51         child->left()->RunAction(child, tokens);
52     if (child_result) results.push_back(std::move(*child_result));
53   }
54   MatchedInput matched_input = completed_item->GetMatchedInput(tokens);
55   CurrentSourcePosition::Scope pos_scope(matched_input.pos);
56   ParseResultIterator iterator(std::move(results), matched_input);
57   auto result = action_(&iterator);
58   // Make sure the parse action consumed all the child results.
59   CHECK(!iterator.HasNext());
60   return result;
61 }
62 
operator =(std::initializer_list<Rule> rules)63 Symbol& Symbol::operator=(std::initializer_list<Rule> rules) {
64   rules_.clear();
65   for (const Rule& rule : rules) {
66     AddRule(rule);
67   }
68   return *this;
69 }
70 
Children() const71 std::vector<const Item*> Item::Children() const {
72   std::vector<const Item*> children;
73   for (const Item* current = this; current->prev_; current = current->prev_) {
74     children.push_back(current->child_);
75   }
76   // The above loop collects the child nodes in reversed order.
77   std::reverse(children.begin(), children.end());
78   DCHECK_EQ(children.size(), right().size());
79   return children;
80 }
81 
SplitByChildren(const LexerResult & tokens) const82 std::string Item::SplitByChildren(const LexerResult& tokens) const {
83   if (right().size() == 1) {
84     if (const Item* child = Children()[0])
85       return child->SplitByChildren(tokens);
86   }
87   std::stringstream s;
88   bool first = true;
89   for (const Item* item : Children()) {
90     if (!item) continue;
91     if (!first) s << "  ";
92     s << item->GetMatchedInput(tokens).ToString();
93     first = false;
94   }
95   return s.str();
96 }
97 
CheckAmbiguity(const Item & other,const LexerResult & tokens) const98 void Item::CheckAmbiguity(const Item& other, const LexerResult& tokens) const {
99   DCHECK(*this == other);
100   if (child_ != other.child_) {
101     std::stringstream s;
102     s << "Ambiguous grammer rules for \""
103       << child_->GetMatchedInput(tokens).ToString() << "\":\n   "
104       << child_->SplitByChildren(tokens) << "\nvs\n   "
105       << other.child_->SplitByChildren(tokens);
106     ReportError(s.str());
107   }
108   if (prev_ != other.prev_) {
109     std::stringstream s;
110     s << "Ambiguous grammer rules for \"" << GetMatchedInput(tokens).ToString()
111       << "\":\n   " << SplitByChildren(tokens) << "  ...\nvs\n   "
112       << other.SplitByChildren(tokens) << "  ...";
113     ReportError(s.str());
114   }
115 }
116 
RunLexer(const std::string & input)117 LexerResult Lexer::RunLexer(const std::string& input) {
118   LexerResult result;
119   InputPosition const begin = input.c_str();
120   InputPosition const end = begin + input.size();
121   InputPosition pos = begin;
122   InputPosition token_start = pos;
123   LineAndColumnTracker line_column_tracker;
124 
125   match_whitespace_(&pos);
126   line_column_tracker.Advance(token_start, pos);
127   while (pos != end) {
128     token_start = pos;
129     Symbol* symbol = MatchToken(&pos, end);
130     DCHECK_IMPLIES(symbol != nullptr, pos != token_start);
131     InputPosition token_end = pos;
132     line_column_tracker.Advance(token_start, token_end);
133     if (!symbol) {
134       CurrentSourcePosition::Scope pos_scope(
135           line_column_tracker.ToSourcePosition());
136       ReportError("Lexer Error: unknown token " +
137                   StringLiteralQuote(std::string(
138                       token_start, token_start + std::min<ptrdiff_t>(
139                                                      end - token_start, 10))));
140     }
141     result.token_symbols.push_back(symbol);
142     result.token_contents.push_back(
143         {token_start, pos, line_column_tracker.ToSourcePosition()});
144     match_whitespace_(&pos);
145     line_column_tracker.Advance(token_end, pos);
146   }
147 
148   // Add an additional token position to simplify corner cases.
149   line_column_tracker.Advance(token_start, pos);
150   result.token_contents.push_back(
151       {pos, pos, line_column_tracker.ToSourcePosition()});
152   return result;
153 }
154 
MatchToken(InputPosition * pos,InputPosition end)155 Symbol* Lexer::MatchToken(InputPosition* pos, InputPosition end) {
156   InputPosition token_start = *pos;
157   Symbol* symbol = nullptr;
158   // Find longest matching pattern.
159   for (std::pair<const PatternFunction, Symbol>& pair : patterns_) {
160     InputPosition token_end = token_start;
161     PatternFunction matchPattern = pair.first;
162     if (matchPattern(&token_end) && token_end > *pos) {
163       *pos = token_end;
164       symbol = &pair.second;
165     }
166   }
167   size_t pattern_size = *pos - token_start;
168 
169   // Now check for keywords. Prefer keywords over patterns unless the pattern is
170   // longer. Iterate from the end to ensure that if one keyword is a prefix of
171   // another, we first try to match the longer one.
172   for (auto it = keywords_.rbegin(); it != keywords_.rend(); ++it) {
173     const std::string& keyword = it->first;
174     if (static_cast<size_t>(end - token_start) < keyword.size()) continue;
175     if (keyword.size() >= pattern_size &&
176         keyword == std::string(token_start, token_start + keyword.size())) {
177       *pos = token_start + keyword.size();
178       return &it->second;
179     }
180   }
181   if (pattern_size > 0) return symbol;
182   return nullptr;
183 }
184 
185 // This is an implementation of Earley's parsing algorithm
186 // (https://en.wikipedia.org/wiki/Earley_parser).
RunEarleyAlgorithm(Symbol * start,const LexerResult & tokens,std::unordered_set<Item,base::hash<Item>> * processed)187 const Item* RunEarleyAlgorithm(
188     Symbol* start, const LexerResult& tokens,
189     std::unordered_set<Item, base::hash<Item>>* processed) {
190   // Worklist for items at the current position.
191   std::vector<Item> worklist;
192   // Worklist for items at the next position.
193   std::vector<Item> future_items;
194   CurrentSourcePosition::Scope source_position(
195       SourcePosition{CurrentSourceFile::Get(), LineAndColumn::Invalid(),
196                      LineAndColumn::Invalid()});
197   std::vector<const Item*> completed_items;
198   std::unordered_map<std::pair<size_t, Symbol*>, std::set<const Item*>,
199                      base::hash<std::pair<size_t, Symbol*>>>
200       waiting;
201 
202   std::vector<const Item*> debug_trace;
203 
204   // Start with one top_level symbol mapping to the start symbol of the grammar.
205   // This simplifies things because the start symbol might have several
206   // rules.
207   Symbol top_level;
208   top_level.AddRule(Rule({start}));
209   worklist.push_back(Item{top_level.rule(0), 0, 0, 0});
210 
211   size_t input_length = tokens.token_symbols.size();
212 
213   for (size_t pos = 0; pos <= input_length; ++pos) {
214     while (!worklist.empty()) {
215       auto insert_result = processed->insert(worklist.back());
216       const Item& item = *insert_result.first;
217       DCHECK_EQ(pos, item.pos());
218       MatchedInput last_token = tokens.token_contents[pos];
219       CurrentSourcePosition::Get() = last_token.pos;
220       bool is_new = insert_result.second;
221       if (!is_new) item.CheckAmbiguity(worklist.back(), tokens);
222       worklist.pop_back();
223       if (!is_new) continue;
224 
225       debug_trace.push_back(&item);
226       if (item.IsComplete()) {
227         // 'Complete' phase: Advance all items that were waiting to match this
228         // symbol next.
229         for (const Item* parent : waiting[{item.start(), item.left()}]) {
230           worklist.push_back(parent->Advance(pos, &item));
231         }
232       } else {
233         Symbol* next = item.NextSymbol();
234         // 'Scan' phase: Check if {next} is the next symbol in the input (this
235         // is never the case if {next} is a non-terminal).
236         if (pos < tokens.token_symbols.size() &&
237             tokens.token_symbols[pos] == next) {
238           future_items.push_back(item.Advance(pos + 1, nullptr));
239         }
240         // 'Predict' phase: Add items for every rule of the non-terminal.
241         if (!next->IsTerminal()) {
242           // Remember that this item is waiting for completion with {next}.
243           waiting[{pos, next}].insert(&item);
244         }
245         for (size_t i = 0; i < next->rule_number(); ++i) {
246           Rule* rule = next->rule(i);
247           auto already_completed =
248               processed->find(Item{rule, rule->right().size(), pos, pos});
249           // As discussed in section 3 of
250           //    Aycock, John, and R. Nigel Horspool. "Practical earley
251           //    parsing." The Computer Journal 45.6 (2002): 620-630.
252           // Earley parsing has the following problem with epsilon rules:
253           // When we complete an item that started at the current position
254           // (that is, it matched zero tokens), we might not yet have
255           // predicted all items it can complete with. Thus we check for the
256           // existence of such items here and complete them immediately.
257           if (already_completed != processed->end()) {
258             worklist.push_back(item.Advance(pos, &*already_completed));
259           } else {
260             worklist.push_back(Item{rule, 0, pos, pos});
261           }
262         }
263       }
264     }
265     std::swap(worklist, future_items);
266   }
267 
268   auto final_item =
269       processed->find(Item{top_level.rule(0), 1, 0, input_length});
270   if (final_item != processed->end()) {
271     // Success: The {top_level} rule matches the complete input.
272     return final_item->Children()[0];
273   }
274   std::string reason;
275   const Item& last_item = *debug_trace.back();
276   if (last_item.pos() < tokens.token_symbols.size()) {
277     std::string next_token = tokens.token_contents[last_item.pos()].ToString();
278     reason = "unexpected token \"" + next_token + "\"";
279   } else {
280     reason = "unexpected end of input";
281   }
282   ReportError("Parser Error: " + reason);
283 }
284 
285 // static
286 DISABLE_CFI_ICALL
MatchChar(int (* char_class)(int),InputPosition * pos)287 bool Grammar::MatchChar(int (*char_class)(int), InputPosition* pos) {
288   if (**pos && char_class(static_cast<unsigned char>(**pos))) {
289     ++*pos;
290     return true;
291   }
292   return false;
293 }
294 
295 // static
MatchChar(bool (* char_class)(char),InputPosition * pos)296 bool Grammar::MatchChar(bool (*char_class)(char), InputPosition* pos) {
297   if (**pos && char_class(**pos)) {
298     ++*pos;
299     return true;
300   }
301   return false;
302 }
303 
304 // static
MatchString(const char * s,InputPosition * pos)305 bool Grammar::MatchString(const char* s, InputPosition* pos) {
306   InputPosition current = *pos;
307   for (; *s != 0; ++s, ++current) {
308     if (*s != *current) return false;
309   }
310   *pos = current;
311   return true;
312 }
313 
314 // static
MatchAnyChar(InputPosition * pos)315 bool Grammar::MatchAnyChar(InputPosition* pos) {
316   return MatchChar([](char c) { return true; }, pos);
317 }
318 
319 }  // namespace torque
320 }  // namespace internal
321 }  // namespace v8
322