1 // Copyright 2017 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/asmjs/asm-scanner.h"
6
7 #include <cinttypes>
8
9 #include "src/flags/flags.h"
10 #include "src/numbers/conversions.h"
11 #include "src/parsing/scanner.h"
12 #include "src/strings/char-predicates-inl.h"
13
14 namespace v8 {
15 namespace internal {
16
namespace {
// Upper bound on the number of identifiers that may be registered. It is
// chosen so that both global and local identifiers can be handed a token id
// that still fits in an int32_t.
constexpr int kMaxIdentifierCount = 0xF000000;
}  // namespace
22
AsmJsScanner(Utf16CharacterStream * stream)23 AsmJsScanner::AsmJsScanner(Utf16CharacterStream* stream)
24 : stream_(stream),
25 token_(kUninitialized),
26 preceding_token_(kUninitialized),
27 next_token_(kUninitialized),
28 position_(0),
29 preceding_position_(0),
30 next_position_(0),
31 rewind_(false),
32 in_local_scope_(false),
33 global_count_(0),
34 double_value_(0.0),
35 unsigned_value_(0),
36 preceded_by_newline_(false) {
37 #define V(name, _junk1, _junk2, _junk3) property_names_[#name] = kToken_##name;
38 STDLIB_MATH_FUNCTION_LIST(V)
39 STDLIB_ARRAY_TYPE_LIST(V)
40 #undef V
41 #define V(name, _junk1) property_names_[#name] = kToken_##name;
42 STDLIB_MATH_VALUE_LIST(V)
43 #undef V
44 #define V(name) property_names_[#name] = kToken_##name;
45 STDLIB_OTHER_LIST(V)
46 #undef V
47 #define V(name) global_names_[#name] = kToken_##name;
48 KEYWORD_NAME_LIST(V)
49 #undef V
50 Next();
51 }
52
Next()53 void AsmJsScanner::Next() {
54 if (rewind_) {
55 preceding_token_ = token_;
56 preceding_position_ = position_;
57 token_ = next_token_;
58 position_ = next_position_;
59 next_token_ = kUninitialized;
60 next_position_ = 0;
61 rewind_ = false;
62 return;
63 }
64
65 if (token_ == kEndOfInput || token_ == kParseError) {
66 return;
67 }
68
69 #if DEBUG
70 if (FLAG_trace_asm_scanner) {
71 if (Token() == kDouble) {
72 PrintF("%lf ", AsDouble());
73 } else if (Token() == kUnsigned) {
74 PrintF("%" PRIu32 " ", AsUnsigned());
75 } else {
76 std::string name = Name(Token());
77 PrintF("%s ", name.c_str());
78 }
79 }
80 #endif
81
82 preceded_by_newline_ = false;
83 preceding_token_ = token_;
84 preceding_position_ = position_;
85
86 for (;;) {
87 position_ = stream_->pos();
88 base::uc32 ch = stream_->Advance();
89 switch (ch) {
90 case ' ':
91 case '\t':
92 case '\r':
93 // Ignore whitespace.
94 break;
95
96 case '\n':
97 // Track when we've passed a newline for optional semicolon support,
98 // but keep scanning.
99 preceded_by_newline_ = true;
100 break;
101
102 case kEndOfInputU:
103 token_ = kEndOfInput;
104 return;
105
106 case '\'':
107 case '"':
108 ConsumeString(ch);
109 return;
110
111 case '/':
112 ch = stream_->Advance();
113 if (ch == '/') {
114 ConsumeCPPComment();
115 } else if (ch == '*') {
116 if (!ConsumeCComment()) {
117 token_ = kParseError;
118 return;
119 }
120 } else {
121 stream_->Back();
122 token_ = '/';
123 return;
124 }
125 // Breaks out of switch, but loops again (i.e. the case when we parsed
126 // a comment, but need to continue to look for the next token).
127 break;
128
129 case '<':
130 case '>':
131 case '=':
132 case '!':
133 ConsumeCompareOrShift(ch);
134 return;
135
136 #define V(single_char_token) case single_char_token:
137 SIMPLE_SINGLE_TOKEN_LIST(V)
138 #undef V
139 // Use fixed token IDs for ASCII.
140 token_ = ch;
141 return;
142
143 default:
144 if (IsIdentifierStart(ch)) {
145 ConsumeIdentifier(ch);
146 } else if (IsNumberStart(ch)) {
147 ConsumeNumber(ch);
148 } else {
149 // TODO(bradnelson): Support unicode (probably via UnicodeCache).
150 token_ = kParseError;
151 }
152 return;
153 }
154 }
155 }
156
Rewind()157 void AsmJsScanner::Rewind() {
158 DCHECK_NE(kUninitialized, preceding_token_);
159 // TODO(bradnelson): Currently rewinding needs to leave in place the
160 // preceding newline state (in case a |0 ends a line).
161 // This is weird and stateful, fix me.
162 DCHECK(!rewind_);
163 next_token_ = token_;
164 next_position_ = position_;
165 token_ = preceding_token_;
166 position_ = preceding_position_;
167 preceding_token_ = kUninitialized;
168 preceding_position_ = 0;
169 rewind_ = true;
170 identifier_string_.clear();
171 }
172
ResetLocals()173 void AsmJsScanner::ResetLocals() { local_names_.clear(); }
174
#if DEBUG
// Debug-only helper that maps |token| back to a printable name. Printable
// ASCII tokens name themselves; otherwise the local, global and property
// tables are searched by value (in that order), followed by the long symbol
// and special token lists. Hitting none of these is treated as unreachable.
std::string AsmJsScanner::Name(token_t token) const {
  const bool is_printable_ascii = token >= 32 && token < 127;
  if (is_printable_ascii) {
    return std::string(1, static_cast<char>(token));
  }

  // Linear reverse lookup: first entry of |table| whose value is |token|.
  const auto name_in = [token](const auto& table) -> const std::string* {
    for (const auto& entry : table) {
      if (entry.second == token) {
        return &entry.first;
      }
    }
    return nullptr;
  };
  if (const std::string* found = name_in(local_names_)) return *found;
  if (const std::string* found = name_in(global_names_)) return *found;
  if (const std::string* found = name_in(property_names_)) return *found;

  switch (token) {
#define ASM_SCANNER_LONG_SYMBOL_CASE(rawname, name) \
  case kToken_##name:                               \
    return rawname;
    LONG_SYMBOL_NAME_LIST(ASM_SCANNER_LONG_SYMBOL_CASE)
#undef ASM_SCANNER_LONG_SYMBOL_CASE
#define ASM_SCANNER_SPECIAL_CASE(name, value, string_name) \
  case name:                                               \
    return string_name;
    SPECIAL_TOKEN_LIST(ASM_SCANNER_SPECIAL_CASE)
#undef ASM_SCANNER_SPECIAL_CASE
    default:
      break;
  }
  UNREACHABLE();
}
#endif
213
Seek(size_t pos)214 void AsmJsScanner::Seek(size_t pos) {
215 stream_->Seek(pos);
216 preceding_token_ = kUninitialized;
217 token_ = kUninitialized;
218 next_token_ = kUninitialized;
219 preceding_position_ = 0;
220 position_ = 0;
221 next_position_ = 0;
222 rewind_ = false;
223 Next();
224 }
225
ConsumeIdentifier(base::uc32 ch)226 void AsmJsScanner::ConsumeIdentifier(base::uc32 ch) {
227 // Consume characters while still part of the identifier.
228 identifier_string_.clear();
229 while (IsIdentifierPart(ch)) {
230 identifier_string_ += ch;
231 ch = stream_->Advance();
232 }
233 // Go back one for next time.
234 stream_->Back();
235
236 // Decode what the identifier means.
237 if (preceding_token_ == '.') {
238 auto i = property_names_.find(identifier_string_);
239 if (i != property_names_.end()) {
240 token_ = i->second;
241 return;
242 }
243 } else {
244 {
245 auto i = local_names_.find(identifier_string_);
246 if (i != local_names_.end()) {
247 token_ = i->second;
248 return;
249 }
250 }
251 if (!in_local_scope_) {
252 auto i = global_names_.find(identifier_string_);
253 if (i != global_names_.end()) {
254 token_ = i->second;
255 return;
256 }
257 }
258 }
259 if (preceding_token_ == '.') {
260 CHECK_LT(global_count_, kMaxIdentifierCount);
261 token_ = kGlobalsStart + global_count_++;
262 property_names_[identifier_string_] = token_;
263 } else if (in_local_scope_) {
264 CHECK_LT(local_names_.size(), kMaxIdentifierCount);
265 token_ = kLocalsStart - static_cast<token_t>(local_names_.size());
266 local_names_[identifier_string_] = token_;
267 } else {
268 CHECK_LT(global_count_, kMaxIdentifierCount);
269 token_ = kGlobalsStart + global_count_++;
270 global_names_[identifier_string_] = token_;
271 }
272 }
273
ConsumeNumber(base::uc32 ch)274 void AsmJsScanner::ConsumeNumber(base::uc32 ch) {
275 std::string number;
276 number.assign(1, ch);
277 bool has_dot = ch == '.';
278 bool has_prefix = false;
279 for (;;) {
280 ch = stream_->Advance();
281 if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') ||
282 (ch >= 'A' && ch <= 'F') || ch == '.' || ch == 'b' || ch == 'o' ||
283 ch == 'x' ||
284 ((ch == '-' || ch == '+') && !has_prefix &&
285 (number[number.size() - 1] == 'e' ||
286 number[number.size() - 1] == 'E'))) {
287 // TODO(bradnelson): Test weird cases ending in -.
288 if (ch == '.') {
289 has_dot = true;
290 }
291 if (ch == 'b' || ch == 'o' || ch == 'x') {
292 has_prefix = true;
293 }
294 number.push_back(ch);
295 } else {
296 break;
297 }
298 }
299 stream_->Back();
300 // Special case the most common number.
301 if (number.size() == 1 && number[0] == '0') {
302 unsigned_value_ = 0;
303 token_ = kUnsigned;
304 return;
305 }
306 // Pick out dot.
307 if (number.size() == 1 && number[0] == '.') {
308 token_ = '.';
309 return;
310 }
311 // Decode numbers.
312 double_value_ = StringToDouble(
313 base::Vector<const uint8_t>::cast(base::VectorOf(number)),
314 ALLOW_HEX | ALLOW_OCTAL | ALLOW_BINARY | ALLOW_IMPLICIT_OCTAL);
315 if (std::isnan(double_value_)) {
316 // Check if string to number conversion didn't consume all the characters.
317 // This happens if the character filter let through something invalid
318 // like: 0123ef for example.
319 // TODO(bradnelson): Check if this happens often enough to be a perf
320 // problem.
321 if (number[0] == '.') {
322 for (size_t k = 1; k < number.size(); ++k) {
323 stream_->Back();
324 }
325 token_ = '.';
326 return;
327 }
328 // Anything else that doesn't parse is an error.
329 token_ = kParseError;
330 return;
331 }
332 if (has_dot || trunc(double_value_) != double_value_) {
333 token_ = kDouble;
334 } else {
335 // Exceeding safe integer range is an error.
336 if (double_value_ > static_cast<double>(kMaxUInt32)) {
337 token_ = kParseError;
338 return;
339 }
340 unsigned_value_ = static_cast<uint32_t>(double_value_);
341 token_ = kUnsigned;
342 }
343 }
344
ConsumeCComment()345 bool AsmJsScanner::ConsumeCComment() {
346 for (;;) {
347 base::uc32 ch = stream_->Advance();
348 while (ch == '*') {
349 ch = stream_->Advance();
350 if (ch == '/') {
351 return true;
352 }
353 }
354 if (ch == '\n') {
355 preceded_by_newline_ = true;
356 }
357 if (ch == kEndOfInputU) {
358 return false;
359 }
360 }
361 }
362
ConsumeCPPComment()363 void AsmJsScanner::ConsumeCPPComment() {
364 for (;;) {
365 base::uc32 ch = stream_->Advance();
366 if (ch == '\n') {
367 preceded_by_newline_ = true;
368 return;
369 }
370 if (ch == kEndOfInputU) {
371 return;
372 }
373 }
374 }
375
ConsumeString(base::uc32 quote)376 void AsmJsScanner::ConsumeString(base::uc32 quote) {
377 // Only string allowed is 'use asm' / "use asm".
378 const char* expected = "use asm";
379 for (; *expected != '\0'; ++expected) {
380 if (stream_->Advance() != static_cast<base::uc32>(*expected)) {
381 token_ = kParseError;
382 return;
383 }
384 }
385 if (stream_->Advance() != quote) {
386 token_ = kParseError;
387 return;
388 }
389 token_ = kToken_UseAsm;
390 }
391
ConsumeCompareOrShift(base::uc32 ch)392 void AsmJsScanner::ConsumeCompareOrShift(base::uc32 ch) {
393 base::uc32 next_ch = stream_->Advance();
394 if (next_ch == '=') {
395 switch (ch) {
396 case '<':
397 token_ = kToken_LE;
398 break;
399 case '>':
400 token_ = kToken_GE;
401 break;
402 case '=':
403 token_ = kToken_EQ;
404 break;
405 case '!':
406 token_ = kToken_NE;
407 break;
408 default:
409 UNREACHABLE();
410 }
411 } else if (ch == '<' && next_ch == '<') {
412 token_ = kToken_SHL;
413 } else if (ch == '>' && next_ch == '>') {
414 if (stream_->Advance() == '>') {
415 token_ = kToken_SHR;
416 } else {
417 token_ = kToken_SAR;
418 stream_->Back();
419 }
420 } else {
421 stream_->Back();
422 token_ = ch;
423 }
424 }
425
IsIdentifierStart(base::uc32 ch)426 bool AsmJsScanner::IsIdentifierStart(base::uc32 ch) {
427 return base::IsInRange(AsciiAlphaToLower(ch), 'a', 'z') || ch == '_' ||
428 ch == '$';
429 }
430
IsIdentifierPart(base::uc32 ch)431 bool AsmJsScanner::IsIdentifierPart(base::uc32 ch) {
432 return IsAsciiIdentifier(ch);
433 }
434
IsNumberStart(base::uc32 ch)435 bool AsmJsScanner::IsNumberStart(base::uc32 ch) {
436 return ch == '.' || IsDecimalDigit(ch);
437 }
438
439 } // namespace internal
440 } // namespace v8
441