1 // Copyright 2018 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <algorithm>
6 #include <set>
7 #include <unordered_map>
8 #include <unordered_set>
9
10 #include "src/torque/ast.h"
11 #include "src/torque/earley-parser.h"
12 #include "src/torque/utils.h"
13
14 namespace v8 {
15 namespace internal {
16 namespace torque {
17
18 namespace {
19
20 struct LineAndColumnTracker {
21 LineAndColumn previous{0, 0};
22 LineAndColumn current{0, 0};
23
Advancev8::internal::torque::__anon1cbbf8af0111::LineAndColumnTracker24 void Advance(InputPosition from, InputPosition to) {
25 previous = current;
26 while (from != to) {
27 if (*from == '\n') {
28 current.line += 1;
29 current.column = 0;
30 } else {
31 current.column += 1;
32 }
33 ++from;
34 }
35 }
36
ToSourcePositionv8::internal::torque::__anon1cbbf8af0111::LineAndColumnTracker37 SourcePosition ToSourcePosition() {
38 return {CurrentSourceFile::Get(), previous, current};
39 }
40 };
41
42 } // namespace
43
RunAction(const Item * completed_item,const LexerResult & tokens) const44 base::Optional<ParseResult> Rule::RunAction(const Item* completed_item,
45 const LexerResult& tokens) const {
46 std::vector<ParseResult> results;
47 for (const Item* child : completed_item->Children()) {
48 if (!child) continue;
49 base::Optional<ParseResult> child_result =
50 child->left()->RunAction(child, tokens);
51 if (child_result) results.push_back(std::move(*child_result));
52 }
53 MatchedInput matched_input = completed_item->GetMatchedInput(tokens);
54 CurrentSourcePosition::Scope pos_scope(matched_input.pos);
55 ParseResultIterator iterator(std::move(results), matched_input);
56 return action_(&iterator);
57 }
58
operator =(std::initializer_list<Rule> rules)59 Symbol& Symbol::operator=(std::initializer_list<Rule> rules) {
60 rules_.clear();
61 for (const Rule& rule : rules) {
62 AddRule(rule);
63 }
64 return *this;
65 }
66
Children() const67 std::vector<const Item*> Item::Children() const {
68 std::vector<const Item*> children;
69 for (const Item* current = this; current->prev_; current = current->prev_) {
70 children.push_back(current->child_);
71 }
72 // The above loop collects the child nodes in reversed order.
73 std::reverse(children.begin(), children.end());
74 DCHECK_EQ(children.size(), right().size());
75 return children;
76 }
77
SplitByChildren(const LexerResult & tokens) const78 std::string Item::SplitByChildren(const LexerResult& tokens) const {
79 if (right().size() == 1) {
80 if (const Item* child = Children()[0])
81 return child->SplitByChildren(tokens);
82 }
83 std::stringstream s;
84 bool first = true;
85 for (const Item* item : Children()) {
86 if (!item) continue;
87 if (!first) s << " ";
88 s << item->GetMatchedInput(tokens).ToString();
89 first = false;
90 }
91 return s.str();
92 }
93
CheckAmbiguity(const Item & other,const LexerResult & tokens) const94 void Item::CheckAmbiguity(const Item& other, const LexerResult& tokens) const {
95 DCHECK(*this == other);
96 if (child_ != other.child_) {
97 std::stringstream s;
98 s << "Ambiguous grammer rules for \""
99 << child_->GetMatchedInput(tokens).ToString() << "\":\n "
100 << child_->SplitByChildren(tokens) << "\nvs\n "
101 << other.child_->SplitByChildren(tokens);
102 ReportError(s.str());
103 }
104 if (prev_ != other.prev_) {
105 std::stringstream s;
106 s << "Ambiguous grammer rules for \"" << GetMatchedInput(tokens).ToString()
107 << "\":\n " << SplitByChildren(tokens) << " ...\nvs\n "
108 << other.SplitByChildren(tokens) << " ...";
109 ReportError(s.str());
110 }
111 }
112
RunLexer(const std::string & input)113 LexerResult Lexer::RunLexer(const std::string& input) {
114 LexerResult result;
115 InputPosition const begin = input.c_str();
116 InputPosition const end = begin + input.size();
117 InputPosition pos = begin;
118 InputPosition token_start = pos;
119 LineAndColumnTracker line_column_tracker;
120
121 match_whitespace_(&pos);
122 line_column_tracker.Advance(token_start, pos);
123 while (pos != end) {
124 token_start = pos;
125 Symbol* symbol = MatchToken(&pos, end);
126 InputPosition token_end = pos;
127 line_column_tracker.Advance(token_start, token_end);
128 if (!symbol) {
129 CurrentSourcePosition::Scope pos_scope(
130 line_column_tracker.ToSourcePosition());
131 ReportError("Lexer Error: unknown token " +
132 StringLiteralQuote(std::string(
133 token_start, token_start + std::min<ptrdiff_t>(
134 end - token_start, 10))));
135 }
136 result.token_symbols.push_back(symbol);
137 result.token_contents.push_back(
138 {token_start, pos, line_column_tracker.ToSourcePosition()});
139 match_whitespace_(&pos);
140 line_column_tracker.Advance(token_end, pos);
141 }
142
143 // Add an additional token position to simplify corner cases.
144 line_column_tracker.Advance(token_start, pos);
145 result.token_contents.push_back(
146 {pos, pos, line_column_tracker.ToSourcePosition()});
147 return result;
148 }
149
MatchToken(InputPosition * pos,InputPosition end)150 Symbol* Lexer::MatchToken(InputPosition* pos, InputPosition end) {
151 InputPosition token_start = *pos;
152 Symbol* symbol = nullptr;
153 // Find longest matching pattern.
154 for (std::pair<const PatternFunction, Symbol>& pair : patterns_) {
155 InputPosition token_end = token_start;
156 PatternFunction matchPattern = pair.first;
157 if (matchPattern(&token_end) && token_end > *pos) {
158 *pos = token_end;
159 symbol = &pair.second;
160 }
161 }
162 size_t pattern_size = *pos - token_start;
163
164 // Now check for keywords. Prefer keywords over patterns unless the pattern is
165 // longer. Iterate from the end to ensure that if one keyword is a prefix of
166 // another, we first try to match the longer one.
167 for (auto it = keywords_.rbegin(); it != keywords_.rend(); ++it) {
168 const std::string& keyword = it->first;
169 if (static_cast<size_t>(end - token_start) < keyword.size()) continue;
170 if (keyword.size() >= pattern_size &&
171 keyword == std::string(token_start, token_start + keyword.size())) {
172 *pos = token_start + keyword.size();
173 return &it->second;
174 }
175 }
176 if (pattern_size > 0) return symbol;
177 return nullptr;
178 }
179
180 // This is an implementation of Earley's parsing algorithm
181 // (https://en.wikipedia.org/wiki/Earley_parser).
RunEarleyAlgorithm(Symbol * start,const LexerResult & tokens,std::unordered_set<Item,base::hash<Item>> * processed)182 const Item* RunEarleyAlgorithm(
183 Symbol* start, const LexerResult& tokens,
184 std::unordered_set<Item, base::hash<Item>>* processed) {
185 // Worklist for items at the current position.
186 std::vector<Item> worklist;
187 // Worklist for items at the next position.
188 std::vector<Item> future_items;
189 CurrentSourcePosition::Scope source_position(
190 SourcePosition{CurrentSourceFile::Get(), {0, 0}, {0, 0}});
191 std::vector<const Item*> completed_items;
192 std::unordered_map<std::pair<size_t, Symbol*>, std::set<const Item*>,
193 base::hash<std::pair<size_t, Symbol*>>>
194 waiting;
195
196 std::vector<const Item*> debug_trace;
197
198 // Start with one top_level symbol mapping to the start symbol of the grammar.
199 // This simplifies things because the start symbol might have several
200 // rules.
201 Symbol top_level;
202 top_level.AddRule(Rule({start}));
203 worklist.push_back(Item{top_level.rule(0), 0, 0, 0});
204
205 size_t input_length = tokens.token_symbols.size();
206
207 for (size_t pos = 0; pos <= input_length; ++pos) {
208 while (!worklist.empty()) {
209 auto insert_result = processed->insert(worklist.back());
210 const Item& item = *insert_result.first;
211 DCHECK_EQ(pos, item.pos());
212 MatchedInput last_token = tokens.token_contents[pos];
213 CurrentSourcePosition::Get() = last_token.pos;
214 bool is_new = insert_result.second;
215 if (!is_new) item.CheckAmbiguity(worklist.back(), tokens);
216 worklist.pop_back();
217 if (!is_new) continue;
218
219 debug_trace.push_back(&item);
220 if (item.IsComplete()) {
221 // 'Complete' phase: Advance all items that were waiting to match this
222 // symbol next.
223 for (const Item* parent : waiting[{item.start(), item.left()}]) {
224 worklist.push_back(parent->Advance(pos, &item));
225 }
226 } else {
227 Symbol* next = item.NextSymbol();
228 // 'Scan' phase: Check if {next} is the next symbol in the input (this
229 // is never the case if {next} is a non-terminal).
230 if (pos < tokens.token_symbols.size() &&
231 tokens.token_symbols[pos] == next) {
232 future_items.push_back(item.Advance(pos + 1, nullptr));
233 }
234 // 'Predict' phase: Add items for every rule of the non-terminal.
235 if (!next->IsTerminal()) {
236 // Remember that this item is waiting for completion with {next}.
237 waiting[{pos, next}].insert(&item);
238 }
239 for (size_t i = 0; i < next->rule_number(); ++i) {
240 Rule* rule = next->rule(i);
241 auto already_completed =
242 processed->find(Item{rule, rule->right().size(), pos, pos});
243 // As discussed in section 3 of
244 // Aycock, John, and R. Nigel Horspool. "Practical earley
245 // parsing." The Computer Journal 45.6 (2002): 620-630.
246 // Earley parsing has the following problem with epsilon rules:
247 // When we complete an item that started at the current position
248 // (that is, it matched zero tokens), we might not yet have
249 // predicted all items it can complete with. Thus we check for the
250 // existence of such items here and complete them immediately.
251 if (already_completed != processed->end()) {
252 worklist.push_back(item.Advance(pos, &*already_completed));
253 } else {
254 worklist.push_back(Item{rule, 0, pos, pos});
255 }
256 }
257 }
258 }
259 std::swap(worklist, future_items);
260 }
261
262 auto final_item =
263 processed->find(Item{top_level.rule(0), 1, 0, input_length});
264 if (final_item != processed->end()) {
265 // Success: The {top_level} rule matches the complete input.
266 return final_item->Children()[0];
267 }
268 std::string reason;
269 const Item& last_item = *debug_trace.back();
270 if (last_item.pos() < tokens.token_symbols.size()) {
271 std::string next_token = tokens.token_contents[last_item.pos()].ToString();
272 reason = "unexpected token \"" + next_token + "\"";
273 } else {
274 reason = "unexpected end of input";
275 }
276 ReportError("Parser Error: " + reason);
277 }
278
279 // static
MatchChar(int (* char_class)(int),InputPosition * pos)280 bool Grammar::MatchChar(int (*char_class)(int), InputPosition* pos) {
281 if (**pos && char_class(static_cast<unsigned char>(**pos))) {
282 ++*pos;
283 return true;
284 }
285 return false;
286 }
287
288 // static
MatchChar(bool (* char_class)(char),InputPosition * pos)289 bool Grammar::MatchChar(bool (*char_class)(char), InputPosition* pos) {
290 if (**pos && char_class(**pos)) {
291 ++*pos;
292 return true;
293 }
294 return false;
295 }
296
297 // static
MatchString(const char * s,InputPosition * pos)298 bool Grammar::MatchString(const char* s, InputPosition* pos) {
299 InputPosition current = *pos;
300 for (; *s != 0; ++s, ++current) {
301 if (*s != *current) return false;
302 }
303 *pos = current;
304 return true;
305 }
306
307 // static
MatchAnyChar(InputPosition * pos)308 bool Grammar::MatchAnyChar(InputPosition* pos) {
309 return MatchChar([](char c) { return true; }, pos);
310 }
311
312 } // namespace torque
313 } // namespace internal
314 } // namespace v8
315